diff --git a/.circleci/config.yml b/.circleci/config.yml index d9440d63..40651290 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,7 +28,7 @@ version: 2.1 export CC=\"ccache ${CC}\" && pip3 install -r requirements.txt && mkdir test-results && - cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto" + cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2" no_output_timeout: 2h - save_cache: key: v1-ccache-{{ .Environment.CIRCLE_JOB }} @@ -59,7 +59,7 @@ version: 2.1 pip3 install -r requirements.txt mkdir test-results cd test - python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto + python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2 no_output_timeout: 2h - store_test_results: path: test/test-results diff --git a/crypto_kem/mceliece348864/META.yml b/crypto_kem/mceliece348864/META.yml new file mode 100644 index 00000000..5f1ee953 --- /dev/null +++ b/crypto_kem/mceliece348864/META.yml @@ -0,0 +1,48 @@ +name: Classic McEliece 348864 +type: kem +claimed-nist-level: 1 +claimed-security: IND-CCA2 +length-public-key: 261120 +length-secret-key: 6452 +length-ciphertext: 128 +length-shared-secret: 32 +nistkat-sha256: f0a166a9115a0c8481c85aee3fe901729a21a8a84a5d2b871fb99fc50223046b +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt diff --git a/crypto_kem/mceliece348864/avx/LICENSE b/crypto_kem/mceliece348864/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece348864/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece348864/avx/Makefile b/crypto_kem/mceliece348864/avx/Makefile new file mode 100644 index 00000000..54ab3480 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/Makefile @@ -0,0 +1,42 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece348864_avx.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c transpose.c \ + util.c uint32_sort.o vec.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x256_sp_asm.S \ + transpose_64x64_asm.S update_asm.S vec128_mul_asm.S vec256_mul_asm.S \ + vec_mul_asm.S vec_mul_sp_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h \ + vec128.h vec256.h vec.h \ + consts.inc powers.inc scalars_2x.inc scalars.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o transpose.o \ + util.o uint32_sort.o vec.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x256_sp_asm.o \ + transpose_64x64_asm.o update_asm.o vec128_mul_asm.o vec256_mul_asm.o \ + vec_mul_asm.o vec_mul_sp_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece348864/avx/aes256ctr.c b/crypto_kem/mceliece348864/avx/aes256ctr.c new file mode 100644 index 00000000..44ff4f32 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece348864/avx/aes256ctr.h b/crypto_kem/mceliece348864/avx/aes256ctr.h new file mode 100644 index 00000000..74213af2 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE348864_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864/avx/api.h b/crypto_kem/mceliece348864/avx/api.h new file mode 100644 index 00000000..15f2b6ec --- /dev/null +++ b/crypto_kem/mceliece348864/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_API_H +#define PQCLEAN_MCELIECE348864_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_ALGNAME 
"Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/benes.c b/crypto_kem/mceliece348864/avx/benes.c new file mode 100644 index 00000000..d1723880 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s 
+ 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); +} + diff --git a/crypto_kem/mceliece348864/avx/benes.h b/crypto_kem/mceliece348864/avx/benes.h new file mode 100644 index 00000000..a417328e --- /dev/null +++ b/crypto_kem/mceliece348864/avx/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_BENES_H +#define PQCLEAN_MCELIECE348864_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + 
+#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/bm.c b/crypto_kem/mceliece348864/avx/bm.c new file mode 100644 index 00000000..666a48c5 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include + +extern void PQCLEAN_MCELIECE348864_AVX_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864_AVX_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + 
interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]); + + PQCLEAN_MCELIECE348864_AVX_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864_AVX_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864_AVX_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(out, out, &BC[0][0]); +} + diff --git a/crypto_kem/mceliece348864/avx/bm.h b/crypto_kem/mceliece348864/avx/bm.h new file mode 100644 index 00000000..dd45f432 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_BM_H +#define PQCLEAN_MCELIECE348864_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t * /*out*/, vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/consts.S b/crypto_kem/mceliece348864/avx/consts.S new file mode 100644 index 00000000..385ad55a --- /dev/null +++ b/crypto_kem/mceliece348864/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE348864_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK2_0 +.globl 
PQCLEAN_MCELIECE348864_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE348864_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece348864/avx/consts.inc b/crypto_kem/mceliece348864/avx/consts.inc new file mode 100644 index 00000000..73b38122 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/consts.inc @@ -0,0 +1,238 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 
0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 
0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 
0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 
0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/crypto_kem/mceliece348864/avx/controlbits.c b/crypto_kem/mceliece348864/avx/controlbits.c new file mode 100644 index 00000000..d9cfce14 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864/avx/controlbits.h b/crypto_kem/mceliece348864/avx/controlbits.h new file mode 100644 index 00000000..2574c5f6 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/crypto_hash.h b/crypto_kem/mceliece348864/avx/crypto_hash.h new file mode 100644 index 00000000..d3ca396a --- /dev/null +++ b/crypto_kem/mceliece348864/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864/avx/decrypt.c b/crypto_kem/mceliece348864/avx/decrypt.c new file mode 100644 index 00000000..c8c59d41 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + uint64_t sk_int[ GFBITS ]; + vec256 eval[16][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int); + + for (i = 0; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, inv[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 1) 
); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 *s0, vec128 *s1) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864_AVX_vec128_or(diff, PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE348864_AVX_vec128_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 16 ][ GFBITS ]; + vec256 scaled[ 16 ][ GFBITS ]; + vec256 eval[16][ GFBITS ]; + + vec128 error128[ 32 ]; + vec256 error256[ 16 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + uint64_t locator[ GFBITS ]; + + vec128 recv128[ 32 ]; + vec256 recv256[ 16 ]; + vec256 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE348864_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 16; i++) { + error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864/avx/decrypt.h b/crypto_kem/mceliece348864/avx/decrypt.h new file mode 100644 index 00000000..3b479a53 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/decrypt.h @@ -0,0 +1,10 @@ 
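The weight and syndrome checks in decrypt.c above avoid secret-dependent branches: weight_check folds both popcounts into (w0 ^ SYS_T) | (w1 ^ SYS_T) and maps "both equal SYS_T" to 1 via a subtract-and-shift, and decrypt returns 1 - (check_synd & check_weight), so 0 signals success as documented. A minimal standalone sketch of the same idiom follows; the helper name is illustrative only and not part of this implementation, and it assumes both inputs stay below 2^15, as the weights here do.

#include <stdint.h>

/* Branch-free equality test: returns 1 if a == b, 0 otherwise.
   Assumes a, b < 2^15 so only the a == b case sets the top bit after the decrement. */
static uint16_t ct_is_equal_16(uint16_t a, uint16_t b) {
    uint16_t x = a ^ b; /* 0 exactly when a == b */
    x -= 1;             /* wraps to 0xFFFF when a == b; top bit stays clear otherwise */
    x >>= 15;           /* keep only the top bit */
    return x;
}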
+#ifndef PQCLEAN_MCELIECE348864_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE348864_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/encrypt.c b/crypto_kem/mceliece348864/avx/encrypt.c new file mode 100644 index 00000000..800c6d73 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE348864_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864/avx/encrypt.h b/crypto_kem/mceliece348864/avx/encrypt.h new file mode 100644 index 00000000..0f54a316 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/fft.c b/crypto_kem/mceliece348864/avx/fft.c new file mode 100644 index 00000000..b8845956 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/fft.c @@ -0,0 +1,172 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 
0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1, t2, t3; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec256 tmp256[ GFBITS ]; + vec256 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 8) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 4]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 6]) & 1; + t3 = -t3; + + out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 5]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 7]) & 1; + t3 = -t3; + + out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); + } + } + + // + + for (i = 0; i < 16; i += 2) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, y, consts[ 1 ]); + + for (b = 0; b < GFBITS; b++) { + x[b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + y[b] ^= x[b]; + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(x[b], y[b]); + } + } + + consts_ptr = 2; + + for (i = 0; i <= 3; i++) { + s = 1 << i; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec256 powers[16][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 16; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +void PQCLEAN_MCELIECE348864_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); 
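    /* Note: radix_conversions() twists the bitsliced input polynomial (GFBITS words
       of 64 coefficient bits each) so that butterflies() can evaluate it with the
       Gao-Mateer additive-FFT butterflies; the result is bitsliced across 16 vec256
       words per coefficient bit, i.e. 16 * 256 = 2^GFBITS = 4096 evaluation points,
       with the x^64 contribution added from powers.inc at the end of butterflies(). */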
+} + diff --git a/crypto_kem/mceliece348864/avx/fft.h b/crypto_kem/mceliece348864/avx/fft.h new file mode 100644 index 00000000..75a7b44f --- /dev/null +++ b/crypto_kem/mceliece348864/avx/fft.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_H +#define PQCLEAN_MCELIECE348864_AVX_FFT_H + +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/fft_tr.c b/crypto_kem/mceliece348864/avx/fft_tr.c new file mode 100644 index 00000000..23b72a7b --- /dev/null +++ b/crypto_kem/mceliece348864/avx/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864_AVX_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t out64[2][64]; + + vec256 p2[ 6 ]; + vec256 buf[64]; + vec256 x[ GFBITS ], y[ GFBITS ]; + vec256 tmp256[ GFBITS ]; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 17; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 3; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 16; j += 2 * s) 
{ + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp256[b]; + } + } + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + x[b] ^= y[b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, x, consts[ 1 ]); + for (b = 0; b < GFBITS; b++) { + y[b] ^= tmp256[b]; + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(x[b], y[b]); + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] ^= in[i + 1][b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] ^= tmp256[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 4) { + for (j = 0; j < 64; j += 8) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1)); + buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2)); + buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2)); + buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 
0][i + 1], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3)); + buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3)); + } + + PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[52], buf[60]); + 
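    /* This unrolled xor chain appears to be the transpose of the broadcast step in
       fft.c: it folds the 64 transposed columns in buf[] down to buf[0] plus six
       partial sums p2[0..5].  Further down, p2[] is copied into pre[], scaled by
       bit-masks derived from beta[], and accumulated into out64[1], while buf[0]
       yields out64[0]. */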
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[10], 
buf[11]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 1); + pre[j][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 2); + pre[j][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 3); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 1); + out64[0][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 2); + out64[0][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 3); + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(out64[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(tmp, pre[i], tmp); + PQCLEAN_MCELIECE348864_AVX_vec_add(out64[1], out64[1], tmp); + } + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(out64[0][i], out64[1][i]); + } +} + +void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864/avx/fft_tr.h b/crypto_kem/mceliece348864/avx/fft_tr.h new file mode 100644 index 00000000..5442858f --- /dev/null +++ b/crypto_kem/mceliece348864/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE348864_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/gf.c b/crypto_kem/mceliece348864/avx/gf.c new file mode 100644 index 00000000..dca11df8 --- /dev/null +++ 
b/crypto_kem/mceliece348864/avx/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_AVX_gf_mul(PQCLEAN_MCELIECE348864_AVX_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864/avx/gf.h 
b/crypto_kem/mceliece348864/avx/gf.h new file mode 100644 index 00000000..3afe96ec --- /dev/null +++ b/crypto_kem/mceliece348864/avx/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_GF_H +#define PQCLEAN_MCELIECE348864_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/int32_sort.c b/crypto_kem/mceliece348864/avx/int32_sort.c new file mode 100644 index 00000000..3412c64d --- /dev/null +++ b/crypto_kem/mceliece348864/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = 
int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = 
_mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + 
int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); 
+ int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 
b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 
x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = 
_mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE348864_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = 
int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j 
+ 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], 
&x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece348864/avx/int32_sort.h b/crypto_kem/mceliece348864/avx/int32_sort.h new file mode 100644 index 00000000..36a5034d --- /dev/null +++ b/crypto_kem/mceliece348864/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece348864/avx/operations.c b/crypto_kem/mceliece348864/avx/operations.c new file mode 100644 index 00000000..cffd2849 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_AVX_load4(rp + i * 4); + } + rp += 
sizeof(perm); + if (PQCLEAN_MCELIECE348864_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/avx/params.h b/crypto_kem/mceliece348864/avx/params.h new file mode 100644 index 00000000..f45ee11e --- /dev/null +++ b/crypto_kem/mceliece348864/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_PARAMS_H +#define PQCLEAN_MCELIECE348864_AVX_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864/avx/pk_gen.c b/crypto_kem/mceliece348864/avx/pk_gen.c new file mode 100644 index 00000000..81d46f69 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/pk_gen.c @@ -0,0 +1,276 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I; + + int i, j, k; + int row, c; + + uint64_t mat[ 
GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + uint64_t sk_int[ GFBITS ]; + + vec256 consts[ 16 ][ GFBITS ]; + vec256 eval[ 16 ][ GFBITS ]; + vec256 prod[ 16 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, prod[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // 
apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + PQCLEAN_MCELIECE348864_AVX_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE348864_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864/avx/pk_gen.h b/crypto_kem/mceliece348864/avx/pk_gen.h new file mode 100644 index 00000000..e3a61a0b --- /dev/null +++ b/crypto_kem/mceliece348864/avx/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE348864_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/powers.inc b/crypto_kem/mceliece348864/avx/powers.inc new file mode 100644 index 00000000..03e8349b --- /dev/null +++ b/crypto_kem/mceliece348864/avx/powers.inc @@ -0,0 +1,224 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 
0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 
0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, diff --git a/crypto_kem/mceliece348864/avx/scalars.inc b/crypto_kem/mceliece348864/avx/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864/avx/scalars_2x.inc b/crypto_kem/mceliece348864/avx/scalars_2x.inc new file mode 100644 index 00000000..604ec6b0 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + 
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/crypto_kem/mceliece348864/avx/sk_gen.c b/crypto_kem/mceliece348864/avx/sk_gen.c new file mode 100644 index 00000000..e30ed8a2 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/avx/sk_gen.h b/crypto_kem/mceliece348864/avx/sk_gen.h new file mode 100644 index 00000000..5baa544d --- /dev/null +++ b/crypto_kem/mceliece348864/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef 
PQCLEAN_MCELIECE348864_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE348864_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/syndrome_asm.S b/crypto_kem/mceliece348864/avx/syndrome_asm.S new file mode 100644 index 00000000..c77b629c --- /dev/null +++ b/crypto_kem/mceliece348864/avx/syndrome_asm.S @@ -0,0 +1,530 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_syndrome_asm +.global PQCLEAN_MCELIECE348864_AVX_syndrome_asm +_PQCLEAN_MCELIECE348864_AVX_syndrome_asm: +PQCLEAN_MCELIECE348864_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: 
ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint64 *)(input_1 + 320) +# asm 1: movq 320(s=int64#6 +# asm 2: movq 320(s=%r9 +movq 320(%rsi),%r9 + +# qhasm: e = *(uint64 *)(input_2 + 416) +# asm 1: movq 416(e=int64#7 +# asm 2: movq 416(e=%rax +movq 416(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 328(p=%rax +movq 328(%rsi),%rax + +# qhasm: e = *(uint64 *)(input_2 + 424) +# asm 1: movq 424(e=int64#8 +# asm 2: movq 424(e=%r10 +movq 424(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 336(p=%eax +movl 336(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#8d +# asm 2: movl 432(e=%r10d +movl 432(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor + + +void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/transpose_64x256_sp_asm.S b/crypto_kem/mceliece348864/avx/transpose_64x256_sp_asm.S new file mode 100644 index 00000000..1e2dabd7 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/transpose_64x256_sp_asm.S @@ 
-0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 x0 + +# qhasm: reg256 x1 + +# qhasm: reg256 x2 + +# qhasm: reg256 x3 + +# qhasm: reg256 x4 + +# qhasm: reg256 x5 + +# qhasm: reg256 x6 + +# qhasm: reg256 x7 + +# qhasm: reg256 t0 + +# qhasm: reg256 t1 + +# qhasm: reg256 v00 + +# qhasm: reg256 v01 + +# qhasm: reg256 v10 + +# qhasm: reg256 v11 + +# qhasm: reg256 mask0 + +# qhasm: reg256 mask1 + +# qhasm: reg256 mask2 + +# qhasm: reg256 mask3 + +# qhasm: reg256 mask4 + +# qhasm: reg256 mask5 + +# qhasm: enter transpose_64x256_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm +.global PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm +_PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm: +PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK5_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0,>mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: 
vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# 
qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 
1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: 
v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 
x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: 
vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 
+# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor 
x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# 
asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & 
mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# 
asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: 
vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: 
vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 
1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# 
asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# 
asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 
+vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq 
$32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 
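+
+# A sketch of the interleaving step repeated throughout this pass, following
+# the qhasm annotations (mask4/mask5 are assumed to select the low and high
+# w-bit halves of each 2w-bit group; here w = 8):
+#
+#   v00 = a & mask4    # low halves of a, kept in place
+#   v10 = b << w       # low halves of b, moved up (the lane-width shift drops b's high halves)
+#   v01 = a >> w       # high halves of a, moved down
+#   v11 = b & mask5    # high halves of b, kept in place
+#   a'  = v00 | v10    # per group: [ b_low : a_low ]
+#   b'  = v01 | v11    # per group: [ b_high : a_high ]
+#
+# i.e. one butterfly stage of the bit-matrix transpose: the high half of each
+# group in a is exchanged with the low half of the corresponding group in b.
+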
+ +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK0_0,>mask4=%ymm4 +vmovapd 
PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 
+vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: 
vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + 
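+
+# Sketch of the butterfly used at block widths below 8 bits (here w = 4),
+# following the qhasm annotations; mask0/mask1 are assumed to select the low
+# and high w-bit halves of each 2w-bit group. Because vpsllq/vpsrlq shift
+# across the whole 64-bit lane, each operand is masked before shifting so
+# that no bits leak into the neighbouring group:
+#
+#   v00 = a & mask0
+#   v10 = (b & mask0) << w   # low halves of b, moved up
+#   v01 = (a & mask1) >> w   # high halves of a, moved down
+#   v11 = b & mask1
+#   a'  = v00 | v10          # per group: [ b_low : a_low ]
+#   b'  = v01 | v11          # per group: [ b_high : a_high ]
+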
+# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 
+vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor 
x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 
unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# 
qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: 
vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 
& mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# 
qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# 
asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 
4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: 
vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 
<<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + 
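+# (mask4/mask5 stage: the 1-bit shift and OR steps below finish exchanging
+#  the odd/even bit columns between the remaining row pairs x2/x3, x4/x5
+#  and x6/x7, after which the eight transposed words are stored back.)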
+# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 
<<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand 
%ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor 
%ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + 
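+# (Last 256-bit column group, loaded from byte offsets 1792..2016: the
+#  remaining 4-bit, 2-bit and 1-bit exchange stages follow here, before the
+#  results are written back starting at offset 1792.)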
+# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# 
asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 
1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 64 ] x2 +# asm 1: movddup 64(r1=reg128#8 +# asm 2: movddup 64(r1=%xmm7 +movddup 64(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 128 ] x2 +# asm 1: movddup 128(r2=reg128#9 +# asm 2: movddup 128(r2=%xmm8 +movddup 128(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 192 ] x2 +# asm 1: movddup 192(r3=reg128#10 +# asm 2: movddup 192(r3=%xmm9 +movddup 192(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 256 ] x2 +# asm 1: movddup 256(r4=reg128#11 +# asm 2: movddup 256(r4=%xmm10 +movddup 256(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 320 ] x2 +# asm 1: movddup 320(r5=reg128#12 +# asm 2: movddup 320(r5=%xmm11 +movddup 320(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 384 ] x2 +# asm 1: movddup 384(r6=reg128#13 +# asm 2: movddup 384(r6=%xmm12 +movddup 384(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 448 ] x2 +# asm 1: movddup 448(r7=reg128#14 +# asm 2: movddup 448(r7=%xmm13 +movddup 448(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# 
qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 
1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 0 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 64 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 128 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 192 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 256 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 320 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 384 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 448 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 8(r0=%xmm6 +movddup 8(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r2=reg128#9 +# asm 2: movddup 136(r2=%xmm8 +movddup 136(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r3=reg128#10 +# asm 2: 
movddup 200(r3=%xmm9 +movddup 200(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r4=reg128#11 +# asm 2: movddup 264(r4=%xmm10 +movddup 264(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r5=reg128#12 +# asm 2: movddup 328(r5=%xmm11 +movddup 328(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r6=reg128#13 +# asm 2: movddup 392(r6=%xmm12 +movddup 392(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r7=reg128#14 +# asm 2: movddup 456(r7=%xmm13 +movddup 456(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld 
$16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw 
$8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 8 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 72 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 136 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 200 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 264 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 328 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 392 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 456 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 16(r0=%xmm6 +movddup 16(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r1=reg128#8 +# asm 2: movddup 80(r1=%xmm7 +movddup 80(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r3=reg128#10 +# asm 2: movddup 208(r3=%xmm9 +movddup 208(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r4=reg128#11 +# asm 2: movddup 272(r4=%xmm10 +movddup 272(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r5=reg128#12 +# asm 2: movddup 336(r5=%xmm11 +movddup 336(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r6=reg128#13 +# asm 2: movddup 
400(r6=%xmm12 +movddup 400(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r7=reg128#14 +# asm 2: movddup 464(r7=%xmm13 +movddup 464(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor 
r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand 
v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 16 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 80 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 144 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 208 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 272 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 336 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 400 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 464 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 24(r0=%xmm6 +movddup 24(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r1=reg128#8 +# asm 2: movddup 88(r1=%xmm7 +movddup 88(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r2=reg128#9 +# asm 2: movddup 152(r2=%xmm8 +movddup 152(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r4=reg128#11 +# asm 2: movddup 280(r4=%xmm10 +movddup 280(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r5=reg128#12 +# asm 2: movddup 344(r5=%xmm11 +movddup 344(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r6=reg128#13 +# asm 2: movddup 408(r6=%xmm12 +movddup 408(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r7=reg128#14 +# asm 2: movddup 472(r7=%xmm13 +movddup 472(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# 
qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld 
$16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | 
v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 24 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 88 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 152 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 216 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 280 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 344 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 408 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 472 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 32(r0=%xmm6 +movddup 32(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r1=reg128#8 +# asm 2: movddup 96(r1=%xmm7 +movddup 96(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r2=reg128#9 +# asm 2: movddup 160(r2=%xmm8 +movddup 160(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r3=reg128#10 +# asm 2: movddup 224(r3=%xmm9 +movddup 224(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r5=reg128#12 +# asm 2: movddup 352(r5=%xmm11 +movddup 352(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r6=reg128#13 +# asm 2: movddup 416(r6=%xmm12 +movddup 416(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r7=reg128#14 +# asm 2: movddup 480(r7=%xmm13 +movddup 480(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor 
r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + 
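+# The stanzas around this point all follow one delta-swap pattern: for a
+# pair of row registers (rA, rB), the bits selected by one mask stay in
+# place, the complementary bits are shifted across by the block width
+# (32, 16 or 8 in this part of the file), and the two halves are
+# recombined with vpor. Each group of eight movddup loads pulls in rows
+# spaced 64 bytes (eight uint64_t rows) apart, so these rounds exchange
+# 32-bit blocks between rows 32 apart, 16-bit blocks between rows 16
+# apart, and 8-bit blocks between rows 8 apart: in effect the coarse
+# rounds of a 64x64 bit-matrix transpose of the table at input_0.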
+# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 32 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 96 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 160 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 224 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 288 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 352 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 416 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 480 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 40(r0=%xmm6 +movddup 40(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r1=reg128#8 +# asm 2: movddup 104(r1=%xmm7 +movddup 104(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r2=reg128#9 +# asm 2: movddup 168(r2=%xmm8 +movddup 168(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r3=reg128#10 +# asm 2: movddup 232(r3=%xmm9 +movddup 232(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r4=reg128#11 +# asm 2: movddup 296(r4=%xmm10 +movddup 296(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r6=reg128#13 +# asm 2: movddup 424(r6=%xmm12 +movddup 424(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r7=reg128#14 +# asm 2: movddup 488(r7=%xmm13 +movddup 488(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & 
mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | 
v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 
+vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 40 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 104 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 168 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 232 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 296 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 360 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 424 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 488 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 48(r0=%xmm6 +movddup 48(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r1=reg128#8 +# asm 2: movddup 112(r1=%xmm7 +movddup 112(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r2=reg128#9 +# asm 2: movddup 176(r2=%xmm8 +movddup 176(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r3=reg128#10 +# asm 2: movddup 240(r3=%xmm9 +movddup 240(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r4=reg128#11 +# asm 2: movddup 304(r4=%xmm10 +movddup 304(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r5=reg128#12 +# asm 2: movddup 368(r5=%xmm11 +movddup 368(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# 
qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + 
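+# Each 64-bit row is broadcast into both lanes of an xmm register by
+# movddup, the masked shift/or rounds then act on both lanes identically,
+# and only the low lane is written back: pextrq $0x0 moves it into a
+# scratch register and movq stores it to the row's original offset. The
+# high lane carries a redundant copy and is simply discarded at store
+# time.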
+# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 48 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 112 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 176 ] = 
buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 240 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 304 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 368 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 432 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 496 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 56(r0=%xmm6 +movddup 56(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r1=reg128#8 +# asm 2: movddup 120(r1=%xmm7 +movddup 120(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r2=reg128#9 +# asm 2: movddup 184(r2=%xmm8 +movddup 184(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r3=reg128#10 +# asm 2: movddup 248(r3=%xmm9 +movddup 248(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r4=reg128#11 +# asm 2: movddup 312(r4=%xmm10 +movddup 312(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r5=reg128#12 +# asm 2: movddup 376(r5=%xmm11 +movddup 376(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r6=reg128#13 +# asm 2: movddup 440(r6=%xmm12 +movddup 440(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 
2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# 
qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# 
asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand 
v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 
+vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 +movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: 
vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: 
vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 
= mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 
& mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: 
r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 
+vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: 
psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 
2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# 
asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor 
%xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# 
asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 
+# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor 
%xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + 
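# Illustrative note (a hedged sketch, not generated qhasm output): each of the
# mask0/mask1, mask2/mask3 and mask4/mask5 groups in this routine is one masked
# "delta swap" step of the 64x64 bit-matrix transpose. For a row pair
# (r_lo, r_hi) and shift s in {4, 2, 1}, the scalar equivalent is roughly
#     new_lo = (r_lo & mask_even) | ((r_hi & mask_even) << s);
#     new_hi = ((r_lo & mask_odd) >> s) | (r_hi & mask_odd);
# where mask_even/mask_odd select the low/high s bits of every 2s-bit lane
# (e.g. 0x3333.../0xCCCC... for s = 2), matching the vpand/psllq/psrlq/vpor
# pattern of the surrounding instructions before the rows are recombined with
# vpunpcklqdq.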
+# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# 
qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu +#include + +void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece348864/avx/update_asm.S b/crypto_kem/mceliece348864/avx/update_asm.S new file mode 100644 index 00000000..cecfdbcb --- /dev/null +++ b/crypto_kem/mceliece348864/avx/update_asm.S @@ -0,0 +1,354 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_update_asm +.global PQCLEAN_MCELIECE348864_AVX_update_asm +_PQCLEAN_MCELIECE348864_AVX_update_asm: +PQCLEAN_MCELIECE348864_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s1 = input_1 +# asm 1: mov s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = 
PQCLEAN_MCELIECE348864_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864_AVX_vec128_set2x( PQCLEAN_MCELIECE348864_AVX_load8(in), PQCLEAN_MCELIECE348864_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864_AVX_store8(out + 0, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864_AVX_store8(out + 8, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece348864/avx/util.h b/crypto_kem/mceliece348864/avx/util.h new file mode 100644 index 00000000..1367edf4 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_UTIL_H +#define PQCLEAN_MCELIECE348864_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/vec.c b/crypto_kem/mceliece348864/avx/vec.c new file mode 100644 index 00000000..6836a151 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec.c @@ -0,0 +1,25 @@ +#include "vec.h" + +#include "params.h" + +extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *); +extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *); + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(h, f, g); +} + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(h, f, g); +} + +void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { 
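    /* bitsliced addition: in GF(2) arithmetic addition is XOR, so the GFBITS
       64-bit limbs of f and g are simply XORed limb by limb into h */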
+ int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/crypto_kem/mceliece348864/avx/vec.h b/crypto_kem/mceliece348864/avx/vec.h new file mode 100644 index 00000000..cbe6beb6 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_VEC_H +#define PQCLEAN_MCELIECE348864_AVX_VEC_H + +#include + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/crypto_kem/mceliece348864/avx/vec128.c b/crypto_kem/mceliece348864/avx/vec128.c new file mode 100644 index 00000000..e40f2897 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec128.c @@ -0,0 +1,83 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece348864/avx/vec128.h b/crypto_kem/mceliece348864/avx/vec128.h new file mode 100644 index 00000000..69c9be1c --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_VEC128_H +#define PQCLEAN_MCELIECE348864_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
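// Illustrative usage (a hedged sketch; `v` stands for any vec128 value): the
// lane index must be a compile-time literal, e.g.
//     uint64_t lo = PQCLEAN_MCELIECE348864_AVX_vec128_extract(v, 0);
//     uint64_t hi = PQCLEAN_MCELIECE348864_AVX_vec128_extract(v, 1);
// Wrapping _mm_extract_epi64 in an ordinary function would pass the index as a
// runtime variable, which compilers generally reject because the intrinsic
// expects an 8-bit immediate.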
+#define PQCLEAN_MCELIECE348864_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece348864/avx/vec128_mul_asm.S b/crypto_kem/mceliece348864/avx/vec128_mul_asm.S new file mode 100644 index 00000000..de29ef1c --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec128_mul_asm.S @@ -0,0 +1,1369 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: 
int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: b11 = mem128[ input_2 + 176 ] x2 +# asm 1: vbroadcasti128 176(b11=reg256#1 +# asm 2: vbroadcasti128 176(b11=%ymm0 +vbroadcasti128 176(%rdx), %ymm0 + +# qhasm: a5[0] = mem128[ input_1 + 80 ] +# asm 1: vinsertf128 $0x0,80(r16=reg256#3 +# asm 2: vpand r16=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 512 ] = r16 +# asm 1: vmovupd r15=reg256#4 +# asm 2: vpand r15=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r14=reg256#6 +# asm 2: vpand r14=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r13=reg256#8 +# asm 2: vpand r13=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r12=reg256#10 +# asm 2: vpand r12=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r11=reg256#1 +# asm 2: vpand r11=%ymm0 +vpand %ymm0,%ymm10,%ymm0 + +# qhasm: b10 = mem128[ input_2 + 160 ] x2 +# asm 1: vbroadcasti128 160(b10=reg256#12 +# asm 2: vbroadcasti128 160(b10=%ymm11 +vbroadcasti128 160(%rdx), %ymm11 + +# qhasm: r = b10 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#4 +# asm 2: vpand r10=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b9 = mem128[ input_2 + 144 ] x2 +# asm 1: vbroadcasti128 144(b9=reg256#12 +# asm 2: vbroadcasti128 144(b9=%ymm11 +vbroadcasti128 144(%rdx), %ymm11 + +# qhasm: r = b9 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#6 +# asm 2: vpand r9=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b8 = mem128[ input_2 + 128 ] x2 +# asm 1: vbroadcasti128 128(b8=reg256#12 +# asm 2: vbroadcasti128 128(b8=%ymm11 +vbroadcasti128 128(%rdx), %ymm11 + +# qhasm: r = b8 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r9 ^= 
r +# asm 1: vpxor r8=reg256#8 +# asm 2: vpand r8=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b7 = mem128[ input_2 + 112 ] x2 +# asm 1: vbroadcasti128 112(b7=reg256#12 +# asm 2: vbroadcasti128 112(b7=%ymm11 +vbroadcasti128 112(%rdx), %ymm11 + +# qhasm: r = b7 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#10 +# asm 2: vpand r7=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b6 = mem128[ input_2 + 96 ] x2 +# asm 1: vbroadcasti128 96(b6=reg256#12 +# asm 2: vbroadcasti128 96(b6=%ymm11 +vbroadcasti128 96(%rdx), %ymm11 + +# qhasm: r = b6 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 80 ] x2 +# asm 1: vbroadcasti128 80(b5=reg256#12 +# asm 2: vbroadcasti128 80(b5=%ymm11 +vbroadcasti128 80(%rdx), %ymm11 + +# qhasm: r = b5 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 64 ] x2 +# asm 1: vbroadcasti128 64(b4=reg256#12 +# asm 2: vbroadcasti128 64(b4=%ymm11 +vbroadcasti128 64(%rdx), %ymm11 + +# qhasm: r = b4 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 48 ] x2 +# asm 1: vbroadcasti128 48(b3=reg256#12 +# asm 2: vbroadcasti128 48(b3=%ymm11 +vbroadcasti128 48(%rdx), %ymm11 + +# qhasm: r = b3 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r6 
^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 32 ] x2 +# asm 1: vbroadcasti128 32(b2=reg256#12 +# asm 2: vbroadcasti128 32(b2=%ymm11 +vbroadcasti128 32(%rdx), %ymm11 + +# qhasm: r = b2 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 16 ] x2 +# asm 1: vbroadcasti128 16(b1=reg256#12 +# asm 2: vbroadcasti128 16(b1=%ymm11 +vbroadcasti128 16(%rdx), %ymm11 + +# qhasm: r = b1 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#1 +# asm 2: vpand r1=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#12 +# asm 2: vbroadcasti128 0(b0=%ymm11 +vbroadcasti128 0(%rdx), %ymm11 + +# qhasm: r = b0 & a5 +# asm 1: vpand r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm1,%ymm1 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm2,%ymm1 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm4,%ymm1 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm6,%ymm1 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm8,%ymm1 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#2 +# asm 2: vpand r0=%ymm1 +vpand %ymm11,%ymm10,%ymm1 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=reg128#1 +# asm 2: movdqu 528(h22=%xmm0 +movdqu 528(%rcx),%xmm0 + +# qhasm: h13 = h22 +# asm 1: movdqa h13=reg128#2 +# asm 2: movdqa h13=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h10 = h22 +# asm 1: movdqa h10=reg128#1 +# asm 2: movdqa h10=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h21 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h21=reg128#3 +# asm 2: movdqu 496(h21=%xmm2 +movdqu 496(%rcx),%xmm2 + +# qhasm: h12 = h21 +# asm 1: movdqa h12=reg128#4 +# asm 2: movdqa h12=%xmm3 +movdqa %xmm2,%xmm3 + +# qhasm: h9 = h21 +# asm 1: movdqa h9=reg128#3 +# asm 2: movdqa h9=%xmm2 +movdqa %xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h20=reg128#5 +# asm 2: movdqu 464(h20=%xmm4 +movdqu 464(%rcx),%xmm4 + +# qhasm: h11 = h20 +# asm 1: movdqa h11=reg128#6 +# asm 2: movdqa h11=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h8 = h20 +# asm 1: movdqa h8=reg128#5 +# asm 2: movdqa h8=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: h19 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h19=reg128#7 
+# asm 2: movdqu 432(h19=%xmm6 +movdqu 432(%rcx),%xmm6 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#1 +# asm 2: vpxor h10=%xmm0 +vpxor %xmm6,%xmm0,%xmm0 + +# qhasm: h7 = h19 +# asm 1: movdqa h7=reg128#7 +# asm 2: movdqa h7=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: h18 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h18=reg128#8 +# asm 2: movdqu 400(h18=%xmm7 +movdqu 400(%rcx),%xmm7 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#3 +# asm 2: vpxor h9=%xmm2 +vpxor %xmm7,%xmm2,%xmm2 + +# qhasm: h6 = h18 +# asm 1: movdqa h6=reg128#8 +# asm 2: movdqa h6=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: h17 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h17=reg128#9 +# asm 2: movdqu 368(h17=%xmm8 +movdqu 368(%rcx),%xmm8 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#5 +# asm 2: vpxor h8=%xmm4 +vpxor %xmm8,%xmm4,%xmm4 + +# qhasm: h5 = h17 +# asm 1: movdqa h5=reg128#9 +# asm 2: movdqa h5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: h16 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h16=reg128#10 +# asm 2: movdqu 336(h16=%xmm9 +movdqu 336(%rcx),%xmm9 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#10 +# asm 2: vpxor 512(h16=%xmm9 +vpxor 512(%rcx),%xmm9,%xmm9 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#7 +# asm 2: vpxor h7=%xmm6 +vpxor %xmm9,%xmm6,%xmm6 + +# qhasm: h4 = h16 +# asm 1: movdqa h4=reg128#10 +# asm 2: movdqa h4=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: h15 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h15=reg128#11 +# asm 2: movdqu 304(h15=%xmm10 +movdqu 304(%rcx),%xmm10 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#11 +# asm 2: vpxor 480(h15=%xmm10 +vpxor 480(%rcx),%xmm10,%xmm10 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#8 +# asm 2: vpxor h6=%xmm7 +vpxor %xmm10,%xmm7,%xmm7 + +# qhasm: h3 = h15 +# asm 1: movdqa h3=reg128#11 +# asm 2: movdqa h3=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: h14 = mem128[ ptr + 272 ] +# asm 1: movdqu 272(h14=reg128#12 +# asm 2: movdqu 272(h14=%xmm11 +movdqu 272(%rcx),%xmm11 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#12 +# asm 2: vpxor 448(h14=%xmm11 +vpxor 448(%rcx),%xmm11,%xmm11 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#9 +# asm 2: vpxor h5=%xmm8 +vpxor %xmm11,%xmm8,%xmm8 + +# qhasm: h2 = h14 +# asm 1: movdqa h2=reg128#12 +# asm 2: movdqa h2=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: h13 = h13 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h13=reg128#2 +# asm 2: vpxor 240(h13=%xmm1 +vpxor 240(%rcx),%xmm1,%xmm1 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#2 +# asm 2: vpxor 416(h13=%xmm1 +vpxor 416(%rcx),%xmm1,%xmm1 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#10 +# asm 2: vpxor h4=%xmm9 +vpxor %xmm1,%xmm9,%xmm9 + +# qhasm: h1 = h13 +# asm 1: movdqa h1=reg128#2 +# asm 2: movdqa h1=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: h12 = h12 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h12=reg128#4 +# asm 2: vpxor 208(h12=%xmm3 +vpxor 208(%rcx),%xmm3,%xmm3 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#4 +# asm 2: vpxor 384(h12=%xmm3 +vpxor 384(%rcx),%xmm3,%xmm3 + +# qhasm: h3 = h3 ^ h12 +# asm 1: vpxor h3=reg128#11 +# asm 2: vpxor h3=%xmm10 +vpxor %xmm3,%xmm10,%xmm10 + +# qhasm: h0 = h12 +# asm 1: movdqa h0=reg128#4 +# asm 2: movdqa h0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: h11 = h11 ^ mem128[ ptr + 352 ] +# asm 1: vpxor 352(h11=reg128#6 +# asm 2: vpxor 352(h11=%xmm5 +vpxor 352(%rcx),%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h11=reg128#6 +# asm 2: vpxor 176(h11=%xmm5 +vpxor 176(%rcx),%xmm5,%xmm5 + +# 
qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#1 +# asm 2: vpxor 320(h10=%xmm0 +vpxor 320(%rcx),%xmm0,%xmm0 + +# qhasm: h10 = h10 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h10=reg128#1 +# asm 2: vpxor 144(h10=%xmm0 +vpxor 144(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#1 +# asm 2: vpxor 288(h9=%xmm0 +vpxor 288(%rcx),%xmm2,%xmm0 + +# qhasm: h9 = h9 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h9=reg128#1 +# asm 2: vpxor 112(h9=%xmm0 +vpxor 112(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#1 +# asm 2: vpxor 256(h8=%xmm0 +vpxor 256(%rcx),%xmm4,%xmm0 + +# qhasm: h8 = h8 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h8=reg128#1 +# asm 2: vpxor 80(h8=%xmm0 +vpxor 80(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#1 +# asm 2: vpxor 224(h7=%xmm0 +vpxor 224(%rcx),%xmm6,%xmm0 + +# qhasm: h7 = h7 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h7=reg128#1 +# asm 2: vpxor 48(h7=%xmm0 +vpxor 48(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%rcx),%xmm7,%xmm0 + +# qhasm: h6 = h6 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h6=reg128#1 +# asm 2: vpxor 16(h6=%xmm0 +vpxor 16(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%rcx),%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%rcx),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%rcx),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%rcx),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%rcx),%xmm1,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%rcx),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE348864_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE348864_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void 
PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE348864_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE348864_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece348864/avx/vec256_mul_asm.S b/crypto_kem/mceliece348864/avx/vec256_mul_asm.S new file mode 100644 index 00000000..5df2bcd7 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec256_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: enter vec256_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm +.global PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm +_PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm: +PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#2 +# asm 2: vmovupd 352(a11=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg256#3 +# asm 2: vpand r11=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r12 = a11 & mem256[input_2 + 32] +# asm 1: vpand 32(r12=reg256#4 +# asm 2: vpand 32(r12=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r13 = a11 & mem256[input_2 + 64] +# asm 1: vpand 64(r13=reg256#5 +# asm 2: vpand 64(r13=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r14 = a11 & mem256[input_2 + 96] +# asm 1: vpand 96(r14=reg256#6 +# asm 2: vpand 96(r14=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r15 = a11 & mem256[input_2 + 128] +# asm 1: vpand 128(r15=reg256#7 +# asm 2: vpand 128(r15=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r16 = a11 & mem256[input_2 + 160] +# asm 1: vpand 160(r16=reg256#8 +# asm 2: vpand 160(r16=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r17 = a11 & mem256[input_2 + 192] +# asm 1: vpand 192(r17=reg256#9 +# asm 2: vpand 192(r17=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r18 = a11 & mem256[input_2 + 224] +# asm 1: vpand 224(r18=reg256#10 +# asm 2: vpand 224(r18=%ymm9 +vpand 
224(%rdx),%ymm1,%ymm9 + +# qhasm: r19 = a11 & mem256[input_2 + 256] +# asm 1: vpand 256(r19=reg256#11 +# asm 2: vpand 256(r19=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r20 = a11 & mem256[input_2 + 288] +# asm 1: vpand 288(r20=reg256#12 +# asm 2: vpand 288(r20=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r21 = a11 & mem256[input_2 + 320] +# asm 1: vpand 320(r21=reg256#13 +# asm 2: vpand 320(r21=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r22 = a11 & mem256[input_2 + 352] +# asm 1: vpand 352(r22=reg256#2 +# asm 2: vpand 352(r22=%ymm1 +vpand 352(%rdx),%ymm1,%ymm1 + +# qhasm: r13 ^= r22 +# asm 1: vpxor r10=reg256#2 +# asm 2: vmovapd r10=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#14 +# asm 2: vmovupd 320(a10=%ymm13 +vmovupd 320(%rsi),%ymm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r21 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#14 +# asm 2: vmovupd 288(a9=%ymm13 +vmovupd 288(%rsi),%ymm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: 
r19 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r20 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#14 +# asm 2: vmovupd 256(a8=%ymm13 +vmovupd 256(%rsi),%ymm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r19 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#14 +# asm 2: vmovupd 224(a7=%ymm13 +vmovupd 224(%rsi),%ymm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r18 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#14 +# asm 2: vmovupd 192(a6=%ymm13 +vmovupd 192(%rsi),%ymm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: 
vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r17 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#14 +# asm 2: vmovupd 160(a5=%ymm13 +vmovupd 160(%rsi),%ymm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r16 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#14 +# asm 2: vmovupd 128(a4=%ymm13 +vmovupd 128(%rsi),%ymm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: 
vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r15 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#14 +# asm 2: vmovupd 96(a3=%ymm13 +vmovupd 96(%rsi),%ymm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r14 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#14 +# asm 2: vmovupd 64(a2=%ymm13 +vmovupd 64(%rsi),%ymm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# 
qhasm: r12 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r13 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#14 +# asm 2: vmovupd 32(a1=%ymm13 +vmovupd 32(%rsi),%ymm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r12 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#14 +# asm 2: vmovupd 0(a0=%ymm13 +vmovupd 0(%rsi),%ymm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm13,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm13,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm13,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm13,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm13,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm13,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm13,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm13,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm13,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm13,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm13,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq 
r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: t0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(t0=reg256#4 +# asm 2: vmovupd 0(t0=%ymm3 +vmovupd 0(%rdx),%ymm3 + +# qhasm: t1 = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(t1=reg256#5 +# asm 2: vmovupd 32(t1=%ymm4 +vmovupd 32(%rdx),%ymm4 + +# qhasm: t2 = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(t2=reg256#6 +# asm 2: vmovupd 64(t2=%ymm5 +vmovupd 64(%rdx),%ymm5 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#7 +# asm 2: vpermq $0xfa,a5=%ymm6 +vpermq $0xfa,%ymm2,%ymm6 + +# qhasm: b5[0,1,2,3] = t2[2,3,2,3] +# asm 1: vpermq $0xee,b5=reg256#8 +# asm 2: vpermq $0xee,b5=%ymm7 +vpermq $0xee,%ymm5,%ymm7 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#9 +# asm 2: vpand r10=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd b4=reg256#6 +# asm 2: vpermq $0x44,b4=%ymm5 +vpermq $0x44,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#9 +# asm 2: vpand r9=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: b3[0,1,2,3] = t1[2,3,2,3] +# asm 1: vpermq $0xee,b3=reg256#10 +# asm 2: vpermq $0xee,b3=%ymm9 +vpermq $0xee,%ymm4,%ymm9 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#11 +# asm 2: vpand r8=%ymm10 +vpand %ymm6,%ymm9,%ymm10 + +# qhasm: b2[0,1,2,3] = t1[0,1,0,1] +# asm 1: vpermq $0x44,b2=reg256#5 +# asm 2: vpermq $0x44,b2=%ymm4 +vpermq $0x44,%ymm4,%ymm4 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm6,%ymm4,%ymm11 + +# qhasm: b1[0,1,2,3] = t0[2,3,2,3] +# asm 1: vpermq $0xee,b1=reg256#13 +# asm 2: vpermq $0xee,b1=%ymm12 +vpermq $0xee,%ymm3,%ymm12 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#14 +# asm 2: vpand r6=%ymm13 +vpand %ymm6,%ymm12,%ymm13 + +# qhasm: b0[0,1,2,3] = t0[0,1,0,1] +# asm 1: vpermq $0x44,b0=reg256#4 +# asm 2: vpermq $0x44,b0=%ymm3 +vpermq $0x44,%ymm3,%ymm3 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#7 +# asm 2: vpand r5=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm7,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm5,%ymm8 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm9,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm4,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm12,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm3,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#9 +# asm 2: vpermq $0xfa,a3=%ymm8 +vpermq $0xfa,%ymm1,%ymm8 + +# qhasm: r = a3 
& b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm8,%ymm7,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm12,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#9 +# asm 2: vpand r3=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm7,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm5,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm9,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm4,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm12,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#11 +# asm 2: vpermq $0xfa,a1=%ymm10 +vpermq $0xfa,%ymm0,%ymm10 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm7,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm5,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm9,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm4,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm12,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#11 +# asm 2: vpand r1=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm0,%ymm7,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm5,%ymm5 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm9,%ymm5 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm12,%ymm4 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# 
asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864/avx/vec_mul_sp_asm.S b/crypto_kem/mceliece348864/avx/vec_mul_sp_asm.S new file mode 100644 index 00000000..0df3f521 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec_mul_sp_asm.S @@ -0,0 +1,1115 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 s0 + +# qhasm: reg256 s1 + +# qhasm: reg256 s2 + +# qhasm: reg256 s3 + +# qhasm: reg256 s4 + +# qhasm: reg256 s5 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: 
reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: int64 h0 + +# qhasm: int64 h1 + +# qhasm: int64 h2 + +# qhasm: int64 h3 + +# qhasm: int64 h4 + +# qhasm: int64 h5 + +# qhasm: int64 h6 + +# qhasm: int64 h7 + +# qhasm: int64 h8 + +# qhasm: int64 h9 + +# qhasm: int64 h10 + +# qhasm: int64 h11 + +# qhasm: int64 h12 + +# qhasm: int64 h13 + +# qhasm: int64 h14 + +# qhasm: int64 h15 + +# qhasm: int64 h16 + +# qhasm: int64 h17 + +# qhasm: int64 h18 + +# qhasm: int64 h19 + +# qhasm: int64 h20 + +# qhasm: int64 h21 + +# qhasm: int64 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: stack64 r11_stack + +# qhasm: stack64 r12_stack + +# qhasm: stack64 r13_stack + +# qhasm: stack64 r14_stack + +# qhasm: stack64 r15_stack + +# qhasm: stack64 rbx_stack + +# qhasm: stack64 rbp_stack + +# qhasm: enter vec_mul_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm +.global PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm +_PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm: +PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $672,%r11 +sub %r11,%rsp + +# qhasm: r11_stack = caller_r11 +# asm 1: movq r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#4 +# asm 2: vpermq $0xfa,a5=%ymm3 +vpermq $0xfa,%ymm2,%ymm3 + +# qhasm: r = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(r=reg256#5 +# asm 2: vmovupd 160(r=%ymm4 +vmovupd 160(%rdx),%ymm4 + +# qhasm: b5[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b5=reg256#5 +# asm 2: vpermq $0xdd,b5=%ymm4 +vpermq $0xdd,%ymm4,%ymm4 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm3,%ymm4,%ymm5 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd r=reg256#6 +# asm 2: vmovupd 128(r=%ymm5 +vmovupd 128(%rdx),%ymm5 + +# qhasm: b4[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b4=reg256#6 +# asm 2: vpermq 
$0xdd,b4=%ymm5 +vpermq $0xdd,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#7 +# asm 2: vpand r9=%ymm6 +vpand %ymm3,%ymm5,%ymm6 + +# qhasm: r = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(r=reg256#8 +# asm 2: vmovupd 96(r=%ymm7 +vmovupd 96(%rdx),%ymm7 + +# qhasm: b3[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b3=reg256#8 +# asm 2: vpermq $0xdd,b3=%ymm7 +vpermq $0xdd,%ymm7,%ymm7 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#9 +# asm 2: vpand r8=%ymm8 +vpand %ymm3,%ymm7,%ymm8 + +# qhasm: r = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(r=reg256#10 +# asm 2: vmovupd 64(r=%ymm9 +vmovupd 64(%rdx),%ymm9 + +# qhasm: b2[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b2=reg256#10 +# asm 2: vpermq $0xdd,b2=%ymm9 +vpermq $0xdd,%ymm9,%ymm9 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#11 +# asm 2: vpand r7=%ymm10 +vpand %ymm3,%ymm9,%ymm10 + +# qhasm: r = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(r=reg256#12 +# asm 2: vmovupd 32(r=%ymm11 +vmovupd 32(%rdx),%ymm11 + +# qhasm: b1[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b1=reg256#12 +# asm 2: vpermq $0xdd,b1=%ymm11 +vpermq $0xdd,%ymm11,%ymm11 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#13 +# asm 2: vpand r6=%ymm12 +vpand %ymm3,%ymm11,%ymm12 + +# qhasm: r = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(r=reg256#14 +# asm 2: vmovupd 0(r=%ymm13 +vmovupd 0(%rdx),%ymm13 + +# qhasm: b0[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b0=reg256#14 +# asm 2: vpermq $0xdd,b0=%ymm13 +vpermq $0xdd,%ymm13,%ymm13 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm3,%ymm13,%ymm3 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm4,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm5,%ymm6 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm7,%ymm6 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm9,%ymm6 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm11,%ymm6 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm13,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#7 +# asm 2: vpermq $0xfa,a3=%ymm6 +vpermq $0xfa,%ymm1,%ymm6 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm6,%ymm4,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm9,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm11,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vpand r3=%ymm6 +vpand %ymm6,%ymm13,%ymm6 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm4,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm5,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm7,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm9,%ymm8 + +# 
qhasm: r4 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm11,%ymm8 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm13,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#9 +# asm 2: vpermq $0xfa,a1=%ymm8 +vpermq $0xfa,%ymm0,%ymm8 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm7,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm11,%ymm10 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#9 +# asm 2: vpand r1=%ymm8 +vpand %ymm8,%ymm13,%ymm8 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm5,%ymm3 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm7,%ymm3 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm9,%ymm3 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm11,%ymm3 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm13,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 
240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864/avx/vec_reduce_asm.S b/crypto_kem/mceliece348864/avx/vec_reduce_asm.S new file mode 100644 index 00000000..9f07f500 --- /dev/null +++ b/crypto_kem/mceliece348864/avx/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm +.global PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm +_PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: +PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 
48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864/clean/api.h b/crypto_kem/mceliece348864/clean/api.h new file mode 100644 index 00000000..0bebdcb5 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_API_H +#define PQCLEAN_MCELIECE348864_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/benes.c b/crypto_kem/mceliece348864/clean/benes.c new file mode 100644 index 00000000..39f639a1 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/benes.c @@ -0,0 +1,139 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* one layer of the benes network */ +static void layer(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i; + + const unsigned char *cond_ptr; + int inc, low; + + uint64_t 
bs[64]; + uint64_t cond[64]; + + // + + for (i = 0; i < 64; i++) { + bs[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(r + i * 8); + } + + if (rev == 0) { + inc = 256; + cond_ptr = bits; + } else { + inc = -256; + cond_ptr = bits + (2 * GFBITS - 2) * 256; + } + + // + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864_CLEAN_store8(r + i * 8, bs[i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE348864_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece348864/clean/benes.h b/crypto_kem/mceliece348864/clean/benes.h new file mode 100644 index 00000000..29fc2740 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_BENES_H +#define PQCLEAN_MCELIECE348864_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char * /*r*/, const unsigned char * /*bits*/, int /*rev*/); +void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf * /*s*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/bm.c b/crypto_kem/mceliece348864/clean/bm.c new file mode 100644 index 00000000..89b8ed4a --- /dev/null +++ b/crypto_kem/mceliece348864/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE348864_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE348864_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece348864/clean/bm.h b/crypto_kem/mceliece348864/clean/bm.h new file mode 100644 index 00000000..c7da4878 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_BM_H +#define PQCLEAN_MCELIECE348864_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/controlbits.c b/crypto_kem/mceliece348864/clean/controlbits.c new file mode 100644 index 00000000..7b3444e3 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864/clean/controlbits.h b/crypto_kem/mceliece348864/clean/controlbits.h new file mode 100644 index 00000000..3125a862 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/crypto_hash.h b/crypto_kem/mceliece348864/clean/crypto_hash.h new file mode 100644 index 00000000..110ecfc9 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864/clean/decrypt.c b/crypto_kem/mceliece348864/clean/decrypt.c new file mode 100644 index 00000000..d180c5cd --- /dev/null +++ b/crypto_kem/mceliece348864/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE348864_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE348864_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE348864_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE348864_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE348864_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece348864/clean/decrypt.h b/crypto_kem/mceliece348864/clean/decrypt.h new file mode 100644 index 00000000..4a80e068 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/encrypt.c b/crypto_kem/mceliece348864/clean/encrypt.c new file mode 100644 index 00000000..27a6ea4f --- /dev/null +++ b/crypto_kem/mceliece348864/clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + +static inline uint8_t 
same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864/clean/encrypt.h b/crypto_kem/mceliece348864/clean/encrypt.h new file mode 100644 index 00000000..2b6daf86 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/gf.c b/crypto_kem/mceliece348864/clean/gf.c new file mode 100644 index 00000000..d974bf60 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/gf.c @@ -0,0 +1,139 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 
9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_CLEAN_gf_mul(PQCLEAN_MCELIECE348864_CLEAN_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece348864/clean/gf.h b/crypto_kem/mceliece348864/clean/gf.h new file mode 100644 index 00000000..c445925a --- /dev/null +++ b/crypto_kem/mceliece348864/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_GF_H +#define PQCLEAN_MCELIECE348864_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE348864_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/operations.c b/crypto_kem/mceliece348864/clean/operations.c new file mode 100644 index 00000000..3a222d77 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" 
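+/*
+   A minimal usage sketch for the three KEM entry points implemented below,
+   using the buffer sizes declared in api.h. It assumes a PQClean-style
+   randombytes() backend is linked in; buffers this large (the public key is
+   261120 bytes) would normally be heap-allocated rather than placed on the
+   stack.
+
+       uint8_t pk[PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_PUBLICKEYBYTES];
+       uint8_t sk[PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_SECRETKEYBYTES];
+       uint8_t ct[PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_CIPHERTEXTBYTES];
+       uint8_t k_enc[PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES];
+       uint8_t k_dec[PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES];
+
+       PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair(pk, sk);
+       PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc(ct, k_enc, pk);
+       PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec(k_dec, ct, sk);
+
+   For an honestly generated ct, k_enc and k_dec agree. A decoding or
+   confirmation failure is absorbed inside crypto_kem_dec, which then derives
+   the shared key from secret-key material instead (implicit rejection) and
+   still returns 0.
+*/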
+#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/clean/params.h b/crypto_kem/mceliece348864/clean/params.h new file mode 100644 index 00000000..ae8aaa91 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864/clean/pk_gen.c b/crypto_kem/mceliece348864/clean/pk_gen.c new file 
mode 100644 index 00000000..eec02be0 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/pk_gen.c @@ -0,0 +1,144 @@ +/* + This file is for public-key generation +*/ + +#include + +#include "benes.h" +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + uint8_t mask; + uint8_t b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE348864_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/clean/pk_gen.h b/crypto_kem/mceliece348864/clean/pk_gen.h new file mode 100644 index 00000000..e92992f5 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/root.c b/crypto_kem/mceliece348864/clean/root.c new file mode 100644 index 00000000..a57f215c --- /dev/null +++ b/crypto_kem/mceliece348864/clean/root.c @@ -0,0 
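/* Illustrative sketch (not part of the diff): the constant-time row update at
   the core of the Gaussian elimination in pk_gen above. The pivot decision is
   turned into an all-zeros or all-ones byte mask, so the same amount of work is
   done whether or not the row is actually added. cond_row_xor is a hypothetical
   stand-alone helper; the real code inlines this over the rows of mat[][]. */
#include <stddef.h>
#include <stdint.h>

static void cond_row_xor(uint8_t *dst, const uint8_t *src, size_t len, uint8_t bit) {
    size_t c;
    uint8_t mask = (uint8_t) (-bit);   /* bit = 0 -> 0x00, bit = 1 -> 0xFF */

    for (c = 0; c < len; c++) {
        dst[c] ^= src[c] & mask;
    }
}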
+1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE348864_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE348864_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece348864/clean/root.h b/crypto_kem/mceliece348864/clean/root.h new file mode 100644 index 00000000..6b125234 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE348864_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE348864_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/sk_gen.c b/crypto_kem/mceliece348864/clean/sk_gen.c new file mode 100644 index 00000000..d75075e3 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/clean/sk_gen.h b/crypto_kem/mceliece348864/clean/sk_gen.h new file mode 
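/* Illustrative sketch (not part of the diff): the idea behind perm_check above.
   The 2^GFBITS candidate values are sorted and the randomness is rejected if any
   two adjacent entries are equal, i.e. if it does not describe a permutation.
   qsort() and the helpers cmp_u32/has_duplicate are used here only for
   readability; the implementation above uses its own sort_63b routine instead. */
#include <stdint.h>
#include <stdlib.h>

static int cmp_u32(const void *a, const void *b) {
    uint32_t x = *(const uint32_t *) a;
    uint32_t y = *(const uint32_t *) b;
    return (x > y) - (x < y);
}

static int has_duplicate(uint32_t *v, size_t n) {
    size_t i;

    qsort(v, n, sizeof v[0], cmp_u32);

    for (i = 1; i < n; i++) {
        if (v[i - 1] == v[i]) {
            return 1;   /* some value repeats: not a permutation */
        }
    }

    return 0;
}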
100644 index 00000000..6f1df9af --- /dev/null +++ b/crypto_kem/mceliece348864/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/synd.c b/crypto_kem/mceliece348864/clean/synd.c new file mode 100644 index 00000000..d473bb1e --- /dev/null +++ b/crypto_kem/mceliece348864/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE348864_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece348864/clean/synd.h b/crypto_kem/mceliece348864/clean/synd.h new file mode 100644 index 00000000..34b61bcd --- /dev/null +++ b/crypto_kem/mceliece348864/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_SYND_H +#define PQCLEAN_MCELIECE348864_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/transpose.c b/crypto_kem/mceliece348864/clean/transpose.c new file mode 100644 index 00000000..cbad4f7b --- /dev/null +++ b/crypto_kem/mceliece348864/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece348864/clean/transpose.h b/crypto_kem/mceliece348864/clean/transpose.h new file mode 100644 index 00000000..1bdc673d --- /dev/null +++ b/crypto_kem/mceliece348864/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void 
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864/clean/util.c b/crypto_kem/mceliece348864/clean/util.c new file mode 100644 index 00000000..75f1bc9c --- /dev/null +++ b/crypto_kem/mceliece348864/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + diff --git a/crypto_kem/mceliece348864/clean/util.h b/crypto_kem/mceliece348864/clean/util.h new file mode 100644 index 00000000..25b6f966 --- /dev/null +++ b/crypto_kem/mceliece348864/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE348864_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/LICENSE b/crypto_kem/mceliece348864/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece348864/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece348864/sse/Makefile b/crypto_kem/mceliece348864/sse/Makefile new file mode 100644 index 00000000..7fd83a5a --- /dev/null +++ b/crypto_kem/mceliece348864/sse/Makefile @@ -0,0 +1,41 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece348864_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c transpose.c util.c \ + vec.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x64_asm.S update_asm.S vec128_mul_asm.S \ + vec_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h vec.h \ + consts.inc powers.inc scalars_2x.inc scalars.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o transpose.o sk_gen.o util.o \ + vec.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x64_asm.o update_asm.o vec128_mul_asm.o \ + vec_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece348864/sse/aes256ctr.c b/crypto_kem/mceliece348864/sse/aes256ctr.c new file mode 100644 index 00000000..0a3fef52 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece348864/sse/aes256ctr.h b/crypto_kem/mceliece348864/sse/aes256ctr.h new file mode 100644 index 00000000..9f62b86d --- /dev/null +++ b/crypto_kem/mceliece348864/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE348864_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864/sse/api.h b/crypto_kem/mceliece348864/sse/api.h new file mode 100644 index 00000000..d834750e --- /dev/null +++ b/crypto_kem/mceliece348864/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_API_H +#define PQCLEAN_MCELIECE348864_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_SECRETKEYBYTES 6452 +#define 
PQCLEAN_MCELIECE348864_SSE_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/benes.c b/crypto_kem/mceliece348864/sse/benes.c new file mode 100644 index 00000000..d0bb0f1a --- /dev/null +++ b/crypto_kem/mceliece348864/sse/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + 
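/* (Illustrative note.) Every layer_* function in this file repeats the same
   masked swap: diff = a ^ b; diff &= cond; a ^= diff; b ^= diff;. A condition
   word of all ones swaps the two 64-bit columns, all zeros leaves them
   untouched, and partial masks swap individual bit positions, so the Benes
   permutation is applied without any data-dependent branches. */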
diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); +} + diff --git a/crypto_kem/mceliece348864/sse/benes.h b/crypto_kem/mceliece348864/sse/benes.h new file mode 100644 index 00000000..267744be --- /dev/null +++ b/crypto_kem/mceliece348864/sse/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_BENES_H +#define PQCLEAN_MCELIECE348864_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void 
PQCLEAN_MCELIECE348864_SSE_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/bm.c b/crypto_kem/mceliece348864/sse/bm.c new file mode 100644 index 00000000..e3257e44 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/bm.c @@ -0,0 +1,220 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include +#include + +extern void PQCLEAN_MCELIECE348864_SSE_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); 
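/* (Illustrative note.) The interleave passes above and below regroup the
   bitsliced data: after the final mask[0] stage, each 16-bit lane of a buf[i]
   word holds one complete GF(2^12) coefficient, which the extraction loops at
   the end of get_coefs then read out with GFMASK. */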
+ + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(prod, in_tmp, &BC[0][1], 16); + + PQCLEAN_MCELIECE348864_SSE_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864_SSE_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864_SSE_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864_SSE_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(out, out, &BC[0][1], 16); +} + diff --git a/crypto_kem/mceliece348864/sse/bm.h b/crypto_kem/mceliece348864/sse/bm.h new file mode 100644 index 00000000..9430fe2d --- /dev/null +++ b/crypto_kem/mceliece348864/sse/bm.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_BM_H +#define PQCLEAN_MCELIECE348864_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t *out, vec128 *in); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/consts.S b/crypto_kem/mceliece348864/sse/consts.S new file mode 100644 index 00000000..ff080b2f --- /dev/null +++ b/crypto_kem/mceliece348864/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE348864_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK3_1 +.globl 
PQCLEAN_MCELIECE348864_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE348864_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece348864/sse/consts.inc b/crypto_kem/mceliece348864/sse/consts.inc new file mode 100644 index 00000000..87b50f73 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/consts.inc @@ -0,0 +1,448 @@ +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC33CC3333CC33CC, 
0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/crypto_kem/mceliece348864/sse/controlbits.c b/crypto_kem/mceliece348864/sse/controlbits.c new file mode 100644 index 00000000..0908baf7 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
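+   (at each recursion level the function below writes the bits for the outermost two
+   switching columns, at bit positions off + i*step and off + ((w-1)*n + i)*step;
+   the middle of the network is produced by two recursive calls on half-size
+   permutations with the step doubled)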
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864/sse/controlbits.h b/crypto_kem/mceliece348864/sse/controlbits.h new file mode 100644 index 00000000..b32ba7b7 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/crypto_hash.h b/crypto_kem/mceliece348864/sse/crypto_hash.h new file mode 100644 index 00000000..c69e5f3c --- /dev/null +++ b/crypto_kem/mceliece348864/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864/sse/decrypt.c b/crypto_kem/mceliece348864/sse/decrypt.c new file mode 100644 index 00000000..653bd005 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/decrypt.c @@ -0,0 +1,203 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + uint64_t irr_int[ GFBITS ]; + vec128 eval[32][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE348864_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, irr_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + uint8_t r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; 
i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u32( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint64_t synd_cmp(vec128 s0[ GFBITS ], vec128 s1[ GFBITS ]) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864_SSE_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864_SSE_vec128_or(diff, PQCLEAN_MCELIECE348864_SSE_vec128_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE348864_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 32 ][ GFBITS ]; + vec128 scaled[ 32 ][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + + vec128 error[ 32 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + + uint64_t locator[ GFBITS ]; + + vec128 recv[ 32 ]; + vec128 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE348864_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864_SSE_benes((uint64_t *) recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + + PQCLEAN_MCELIECE348864_SSE_fft_tr(s_priv, scaled); + + PQCLEAN_MCELIECE348864_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864_SSE_vec128_setbits(1); + + for (i = 0; i < 32; i++) { + error[i] = PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE348864_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE348864_SSE_benes((uint64_t *) error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864/sse/decrypt.h b/crypto_kem/mceliece348864/sse/decrypt.h new file mode 100644 index 00000000..91fa2f4b --- /dev/null +++ b/crypto_kem/mceliece348864/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE348864_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/encrypt.c b/crypto_kem/mceliece348864/sse/encrypt.c new file mode 100644 index 00000000..ca5fa765 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint16_t ind[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + 
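+        // Rejection sampling: draw SYS_T*2 candidate 16-bit indices, reduce each to
+        // GFBITS bits, and keep only those below SYS_N; restart if fewer than SYS_T
+        // survive or if the first SYS_T of them contain a repeat, so that the error
+        // vector e ends up with weight exactly SYS_T.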
randombytes((uint8_t *)ind_, sizeof(ind_)); + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864/sse/encrypt.h b/crypto_kem/mceliece348864/sse/encrypt.h new file mode 100644 index 00000000..bdc07823 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/fft.c b/crypto_kem/mceliece348864/sse/fft.c new file mode 100644 index 00000000..c8b1b23f --- /dev/null +++ b/crypto_kem/mceliece348864/sse/fft.c @@ -0,0 +1,155 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" +#include "vec128.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec128 tmp[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 4) { + for (i = 0; i < GFBITS; i++) 
{ + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + + out[j / 2 + 0][i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(t0, t1); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + + out[j / 2 + 1][i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(t0, t1); + } + } + + // + + + for (i = 0; i < 32; i += 2) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(out[i + 0][b], out[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = x[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = y[b]; + } + } + + consts_ptr += 1; + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec128 powers[32][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +void PQCLEAN_MCELIECE348864_SSE_fft(vec128 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece348864/sse/fft.h b/crypto_kem/mceliece348864/sse/fft.h new file mode 100644 index 00000000..4fecb38b --- /dev/null +++ b/crypto_kem/mceliece348864/sse/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_FFT_H +#define PQCLEAN_MCELIECE348864_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_fft(vec128 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/fft_tr.c b/crypto_kem/mceliece348864/sse/fft_tr.c new file mode 100644 index 00000000..74c79430 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/fft_tr.c @@ -0,0 +1,312 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" +#include "vec128.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t t[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + + uint64_t out64[2][GFBITS]; + + vec128 p2[ 6 ]; + vec128 buf[64]; + vec128 tt[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 32; + + const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tt, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tt[b]; + } + } + } + } + + for (i = 0; i < 32; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = x[b] ^ y[b]; + } + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tt, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = y[b] ^ tt[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 2) { + for (j = 0; j < 64; j += 4) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 1], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 1], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 1], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 1], 1)); + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[33], buf[32]); + 
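+            // This unrolled sequence (begun just above) xor-folds the 64 transposed
+            // words into one another while collecting six parity words in p2[0..5];
+            // the parities and the final buf[0] are written to pre[][] and out64[0][]
+            // below and then combined with the beta constants at the end of this function.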
p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[50]); + buf[51] = 
PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[12]); + 
buf[4] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(p2[j], 1); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[0], 1); + } + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[0] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(out64[1], pre[0], t); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[i] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(t, pre[i], t); + PQCLEAN_MCELIECE348864_SSE_vec_add(out64[1], out64[1], t); + } + + for (b = 0; b < GFBITS; b++) { + out[b] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(out64[0][b], out64[1][b]); + } +} + +void PQCLEAN_MCELIECE348864_SSE_fft_tr(vec128 out[GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864/sse/fft_tr.h b/crypto_kem/mceliece348864/sse/fft_tr.h new file mode 100644 index 00000000..92445640 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/fft_tr.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE348864_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_fft_tr(vec128 /*out*/[GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/gf.c b/crypto_kem/mceliece348864/sse/gf.c new file mode 100644 index 00000000..fa5f25d6 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x 
= in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_SSE_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_SSE_gf_mul(PQCLEAN_MCELIECE348864_SSE_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864/sse/gf.h b/crypto_kem/mceliece348864/sse/gf.h new file mode 100644 index 00000000..8b3254d6 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_GF_H +#define PQCLEAN_MCELIECE348864_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864_SSE_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_SSE_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/operations.c b/crypto_kem/mceliece348864/sse/operations.c new file 
mode 100644 index 00000000..fe43ab23 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/sse/params.h b/crypto_kem/mceliece348864/sse/params.h new file mode 100644 index 00000000..1b23043c --- /dev/null +++ b/crypto_kem/mceliece348864/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_PARAMS_H +#define PQCLEAN_MCELIECE348864_SSE_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) 
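+/* For this parameter set the definitions above and below evaluate to:
+ *   PK_NROWS = 768, PK_NCOLS = 2720, PK_ROW_BYTES = 340 -> 768*340 = 261120-byte public key
+ *   IRR_BYTES = 128, COND_BYTES = 5888, SK_BYTES = 6452 -> 6452-byte secret key
+ *   SYND_BYTES = 96 -> 96 + 32 = 128-byte ciphertext (syndrome plus confirmation hash)
+ */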
+#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864/sse/pk_gen.c b/crypto_kem/mceliece348864/sse/pk_gen.c new file mode 100644 index 00000000..0d0097c3 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/pk_gen.c @@ -0,0 +1,329 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
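+    // (branch-free: each pivot column index is obtained as the ctz of the OR of the
+    // current row with all rows below it, and eliminations are applied through
+    // computed masks instead of branches)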
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + +int PQCLEAN_MCELIECE348864_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + uint64_t irr_int[ GFBITS ]; + + vec128 consts[32][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + vec128 prod[ 32 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + 
k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE348864_SSE_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864_SSE_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864/sse/pk_gen.h b/crypto_kem/mceliece348864/sse/pk_gen.h new file mode 100644 index 00000000..e54b9e6f --- /dev/null +++ b/crypto_kem/mceliece348864/sse/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE348864_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/powers.inc b/crypto_kem/mceliece348864/sse/powers.inc new file mode 100644 index 00000000..8e15bd37 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/powers.inc @@ -0,0 +1,448 @@ +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, diff --git a/crypto_kem/mceliece348864/sse/scalars.inc b/crypto_kem/mceliece348864/sse/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, 
+ 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864/sse/scalars_2x.inc b/crypto_kem/mceliece348864/sse/scalars_2x.inc new file mode 100644 index 00000000..8eb78032 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/crypto_kem/mceliece348864/sse/sk_gen.c b/crypto_kem/mceliece348864/sse/sk_gen.c new file mode 100644 index 00000000..255182fb --- /dev/null +++ b/crypto_kem/mceliece348864/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit 
integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/sse/sk_gen.h b/crypto_kem/mceliece348864/sse/sk_gen.h new file mode 100644 index 00000000..ca96519e --- /dev/null +++ b/crypto_kem/mceliece348864/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE348864_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE348864_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/syndrome_asm.S b/crypto_kem/mceliece348864/sse/syndrome_asm.S new file mode 100644 index 00000000..66c9efec --- /dev/null +++ b/crypto_kem/mceliece348864/sse/syndrome_asm.S @@ -0,0 +1,740 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_syndrome_asm +.global PQCLEAN_MCELIECE348864_SSE_syndrome_asm +_PQCLEAN_MCELIECE348864_SSE_syndrome_asm: +PQCLEAN_MCELIECE348864_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#3 +# asm 2: movdqu 112(ee=%xmm2 +movdqu 112(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#3 +# asm 2: movdqu 128(ee=%xmm2 +movdqu 128(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#3 +# asm 2: movdqu 144(ee=%xmm2 +movdqu 144(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#3 +# asm 2: movdqu 
160(ee=%xmm2 +movdqu 160(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#3 +# asm 2: movdqu 176(ee=%xmm2 +movdqu 176(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#3 +# asm 2: movdqu 192(ee=%xmm2 +movdqu 192(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#3 +# asm 2: movdqu 208(ee=%xmm2 +movdqu 208(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 
320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 336) +# asm 1: movl 336(s=int64#6d +# asm 2: movl 336(s=%r9d +movl 336(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#7d +# asm 2: movl 432(e=%eax +movl 432(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor + +void PQCLEAN_MCELIECE348864_SSE_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp(vec128 *in); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/transpose_64x128_sp_asm.S b/crypto_kem/mceliece348864/sse/transpose_64x128_sp_asm.S new file mode 100644 index 00000000..b3aae490 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/transpose_64x128_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 x0 + +# qhasm: reg128 x1 + +# qhasm: reg128 x2 + +# qhasm: reg128 x3 + +# qhasm: reg128 x4 + +# qhasm: reg128 x5 + +# qhasm: reg128 
x6 + +# qhasm: reg128 x7 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 v00 + +# qhasm: reg128 v01 + +# qhasm: reg128 v10 + +# qhasm: reg128 v11 + +# qhasm: reg128 mask0 + +# qhasm: reg128 mask1 + +# qhasm: reg128 mask2 + +# qhasm: reg128 mask3 + +# qhasm: reg128 mask4 + +# qhasm: reg128 mask5 + +# qhasm: enter transpose_64x128_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm +.global PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm +_PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm: +PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# 
qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld 
$16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | 
v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 
544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 
+vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + 
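+# The repeated pattern in this routine performs one interleaving step of
+# a bit-matrix transpose: for a register pair (a, b) and a power-of-two
+# field width d, mask_lo selects the low d bits and mask_hi the high d
+# bits of every 2d-bit field.  Roughly, in C (an illustrative sketch
+# only; these names do not appear in this file):
+#
+#     new_a = (a & mask_lo) | (b << d);
+#     new_b = (a >> d)      | (b & mask_hi);
+#
+# so a collects the low d-bit halves of (a, b) and b the high halves,
+# i.e. the off-diagonal d x d blocks of the pair are swapped.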
+# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 
1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 
+# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq 
$32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# 
qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: 
vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# 
asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor 
x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 
+vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand 
%xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 
+movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 
1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# 
qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 
64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand 
%xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 
+vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: 
vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = 
mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 
v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# 
qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: 
vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor 
x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: 
psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: 
vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: 
vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand 
%xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: 
vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 64 ] x2 +# asm 1: movddup 64(r1=reg128#8 +# asm 2: movddup 64(r1=%xmm7 +movddup 64(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 128 ] x2 +# asm 1: movddup 128(r2=reg128#9 +# asm 2: movddup 128(r2=%xmm8 +movddup 128(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 192 ] x2 +# asm 1: movddup 192(r3=reg128#10 +# asm 2: movddup 192(r3=%xmm9 +movddup 192(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 256 ] x2 +# asm 1: movddup 256(r4=reg128#11 +# asm 2: movddup 256(r4=%xmm10 +movddup 256(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 320 ] x2 +# asm 1: movddup 320(r5=reg128#12 +# asm 2: movddup 320(r5=%xmm11 +movddup 320(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 384 ] x2 +# asm 1: movddup 384(r6=reg128#13 +# asm 2: movddup 384(r6=%xmm12 +movddup 384(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 448 ] x2 +# asm 1: movddup 448(r7=reg128#14 +# asm 2: movddup 448(r7=%xmm13 +movddup 448(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: 
vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: 
vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + 
+# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 0 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 64 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 128 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 192 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 256 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 320 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 384 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 448 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 8(r0=%xmm6 +movddup 8(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r2=reg128#9 +# asm 2: movddup 136(r2=%xmm8 +movddup 136(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r3=reg128#10 +# asm 2: movddup 200(r3=%xmm9 +movddup 200(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r4=reg128#11 +# asm 2: movddup 264(r4=%xmm10 +movddup 264(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r5=reg128#12 +# asm 2: movddup 328(r5=%xmm11 +movddup 328(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r6=reg128#13 +# asm 2: movddup 392(r6=%xmm12 +movddup 392(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r7=reg128#14 +# asm 2: movddup 456(r7=%xmm13 +movddup 456(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# 
asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld 
$16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw 
$8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 8 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 72 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 136 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 200 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 264 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 328 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 392 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 456 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 16(r0=%xmm6 +movddup 16(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r1=reg128#8 +# asm 2: movddup 80(r1=%xmm7 +movddup 80(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r3=reg128#10 +# asm 2: movddup 208(r3=%xmm9 +movddup 208(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r4=reg128#11 +# asm 2: movddup 272(r4=%xmm10 +movddup 272(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r5=reg128#12 +# asm 2: movddup 336(r5=%xmm11 +movddup 336(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r6=reg128#13 +# asm 2: movddup 400(r6=%xmm12 +movddup 400(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r7=reg128#14 +# asm 2: movddup 464(r7=%xmm13 +movddup 464(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + 
+# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# 
qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: 
pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 16 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 80 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 144 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 208 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 272 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 336 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 400 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 464 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 24(r0=%xmm6 +movddup 24(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r1=reg128#8 +# asm 2: movddup 88(r1=%xmm7 +movddup 88(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r2=reg128#9 +# asm 2: movddup 152(r2=%xmm8 +movddup 152(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r4=reg128#11 +# asm 2: movddup 280(r4=%xmm10 +movddup 280(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r5=reg128#12 +# asm 2: movddup 344(r5=%xmm11 +movddup 344(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r6=reg128#13 +# asm 2: movddup 408(r6=%xmm12 +movddup 408(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r7=reg128#14 +# asm 2: movddup 472(r7=%xmm13 +movddup 472(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 
+vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld 
$16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 24 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 88 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 152 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + 
+# qhasm: mem64[ input_0 + 216 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 280 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 344 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 408 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 472 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 32(r0=%xmm6 +movddup 32(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r1=reg128#8 +# asm 2: movddup 96(r1=%xmm7 +movddup 96(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r2=reg128#9 +# asm 2: movddup 160(r2=%xmm8 +movddup 160(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r3=reg128#10 +# asm 2: movddup 224(r3=%xmm9 +movddup 224(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r5=reg128#12 +# asm 2: movddup 352(r5=%xmm11 +movddup 352(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r6=reg128#13 +# asm 2: movddup 416(r6=%xmm12 +movddup 416(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r7=reg128#14 +# asm 2: movddup 480(r7=%xmm13 +movddup 480(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor 
r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 
2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 32 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 96 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 160 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 224 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 288 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 352 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 416 ] = buf +# asm 1: 
movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 480 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 40(r0=%xmm6 +movddup 40(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r1=reg128#8 +# asm 2: movddup 104(r1=%xmm7 +movddup 104(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r2=reg128#9 +# asm 2: movddup 168(r2=%xmm8 +movddup 168(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r3=reg128#10 +# asm 2: movddup 232(r3=%xmm9 +movddup 232(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r4=reg128#11 +# asm 2: movddup 296(r4=%xmm10 +movddup 296(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r6=reg128#13 +# asm 2: movddup 424(r6=%xmm12 +movddup 424(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r7=reg128#14 +# asm 2: movddup 488(r7=%xmm13 +movddup 488(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq 
$32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# 
qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 40 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 104 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 168 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 232 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 296 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 360 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 424 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 488 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 48(r0=%xmm6 +movddup 48(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r1=reg128#8 +# asm 2: movddup 112(r1=%xmm7 +movddup 112(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r2=reg128#9 +# 
asm 2: movddup 176(r2=%xmm8 +movddup 176(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r3=reg128#10 +# asm 2: movddup 240(r3=%xmm9 +movddup 240(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r4=reg128#11 +# asm 2: movddup 304(r4=%xmm10 +movddup 304(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r5=reg128#12 +# asm 2: movddup 368(r5=%xmm11 +movddup 368(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand 
v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 
+vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 48 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 112 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 176 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 240 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 304 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 368 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 432 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 496 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 56(r0=%xmm6 +movddup 56(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r1=reg128#8 +# asm 2: movddup 120(r1=%xmm7 +movddup 120(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r2=reg128#9 +# asm 2: movddup 184(r2=%xmm8 +movddup 184(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r3=reg128#10 +# asm 2: movddup 248(r3=%xmm9 +movddup 248(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r4=reg128#11 +# asm 2: movddup 312(r4=%xmm10 +movddup 312(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r5=reg128#12 +# asm 2: 
movddup 376(r5=%xmm11 +movddup 376(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r6=reg128#13 +# asm 2: movddup 440(r6=%xmm12 +movddup 440(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 
+# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# 
qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa 
PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand 
v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 +movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 
104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# 
asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 
+# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 
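+
+# Note: the block below repeats the same in-register butterfly on the next
+# group of eight 64-bit rows: pairs of rows are combined with
+# vpand/psllq/psrlq/vpor under the MASK2_*/MASK1_*/MASK0_* constants,
+# swapping 4-, 2- and 1-bit sub-blocks within each row.  Together with the
+# 32-/16-/8-bit shift layers earlier in the file, this appears to form the
+# fine-grained half of a 64x64 bit-matrix transpose.
+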
+ +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand 
%xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand 
v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: 
r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 
+vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand 
v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | 
v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# 
asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 
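
Chained across distances 32 down to 1, masked swaps of this kind transpose a 64x64 bit matrix stored as 64 consecutive 64-bit words, which is what this generated routine appears to compute block by block. A scalar reference sketch of that whole transpose, with a hypothetical function name and the conventional mask table (assumptions for illustration, not code from this patch):

#include <stdint.h>

/* Scalar sketch of a 64x64 bit-matrix transpose built from masked swaps
 * at distances 32, 16, 8, 4, 2, 1.  in/out are 64 rows of 64 bits each. */
static void transpose_64x64_sketch(uint64_t *out, const uint64_t *in) {
    static const uint64_t masks[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL},
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL},
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL},
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL},
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL},
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL},
    };
    int i, j, d, s;
    uint64_t x, y;

    for (i = 0; i < 64; i++) {
        out[i] = in[i];
    }

    for (d = 5; d >= 0; d--) {
        s = 1 << d;                              /* swap distance */
        for (i = 0; i < 64; i += 2 * s) {
            for (j = i; j < i + s; j++) {
                x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
                y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
                out[j]     = x;
                out[j + s] = y;
            }
        }
    }
}

Each inner iteration is exactly the masked-swap step shown earlier, applied to row pairs s apart; the generated assembly unrolls this loop nest and performs the same steps in xmm registers.
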
+ +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand 
%xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: 
mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand 
%xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: 
shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_SSE_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864_SSE_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_SSE_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864_SSE_vec128_set2x( PQCLEAN_MCELIECE348864_SSE_load8(in), PQCLEAN_MCELIECE348864_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864_SSE_store8(out + 0, PQCLEAN_MCELIECE348864_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864_SSE_store8(out + 8, PQCLEAN_MCELIECE348864_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece348864/sse/util.h b/crypto_kem/mceliece348864/sse/util.h new file mode 100644 index 00000000..97491b39 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_UTIL_H +#define PQCLEAN_MCELIECE348864_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864_SSE_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864_SSE_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864_SSE_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864_SSE_irr_load(uint64_t *out, const unsigned char *in); + +void 
PQCLEAN_MCELIECE348864_SSE_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864_SSE_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864_SSE_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864_SSE_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/crypto_kem/mceliece348864/sse/vec.c b/crypto_kem/mceliece348864/sse/vec.c new file mode 100644 index 00000000..82e40b26 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec.c @@ -0,0 +1,17 @@ + +#include "vec.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(h, f, g, 8); +} + +void PQCLEAN_MCELIECE348864_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { + int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/crypto_kem/mceliece348864/sse/vec.h b/crypto_kem/mceliece348864/sse/vec.h new file mode 100644 index 00000000..d33258e0 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_VEC_H +#define PQCLEAN_MCELIECE348864_SSE_VEC_H + +#include + +extern void PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *, int); + +void PQCLEAN_MCELIECE348864_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); +void PQCLEAN_MCELIECE348864_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/crypto_kem/mceliece348864/sse/vec128.c b/crypto_kem/mceliece348864/sse/vec128.c new file mode 100644 index 00000000..219cb19c --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec128.c @@ -0,0 +1,143 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 
PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE348864_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS]; + + result[0] = in[0] ^ in[6]; + result[1] = in[11]; + result[2] = in[1] ^ in[7]; + result[3] = in[6]; + result[4] = in[2] ^ in[11] ^ in[8]; + result[5] = in[7]; + result[6] = in[3] ^ in[9]; + result[7] = in[8]; + result[8] = in[4] ^ in[10]; + result[9] = in[9]; + result[10] = in[5] ^ in[11]; + result[11] = in[10]; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE348864_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp_11, out, in); // 11 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp_1111, out, tmp_11); // 1111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, tmp_1111); // 11111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, tmp_11); // 1111111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, in); // 11111111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); // 111111111110 +} + diff --git a/crypto_kem/mceliece348864/sse/vec128.h b/crypto_kem/mceliece348864/sse/vec128.h new file mode 100644 index 00000000..e002e77f --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec128.h @@ -0,0 +1,42 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_VEC128_H +#define PQCLEAN_MCELIECE348864_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
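+// (The lane index must therefore be a compile-time constant: with SSE4.1,
+// _mm_extract_epi64(v, 0) returns the low 64 bits of the 128-bit vector and
+// _mm_extract_epi64(v, 1) the high 64 bits; util.c's store16(), for example,
+// invokes this macro with the literals 0 and 1 to split a vec128 into two
+// 64-bit stores.)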
+#define PQCLEAN_MCELIECE348864_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE348864_SSE_vec128_sq(vec128 * /*out*/, const vec128 * /*in*/); +void PQCLEAN_MCELIECE348864_SSE_vec128_inv(vec128 * /*out*/, const vec128 * /*in*/); + +#endif diff --git a/crypto_kem/mceliece348864/sse/vec128_mul_asm.S b/crypto_kem/mceliece348864/sse/vec128_mul_asm.S new file mode 100644 index 00000000..f9c2753b --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec128_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#2 +# 
asm 2: movdqu 176(a11=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg128#3 +# asm 2: vpand r11=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r12 = a11 & mem128[input_2 + 16] +# asm 1: vpand 16(r12=reg128#4 +# asm 2: vpand 16(r12=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r13 = a11 & mem128[input_2 + 32] +# asm 1: vpand 32(r13=reg128#5 +# asm 2: vpand 32(r13=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r14 = a11 & mem128[input_2 + 48] +# asm 1: vpand 48(r14=reg128#6 +# asm 2: vpand 48(r14=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r15 = a11 & mem128[input_2 + 64] +# asm 1: vpand 64(r15=reg128#7 +# asm 2: vpand 64(r15=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r16 = a11 & mem128[input_2 + 80] +# asm 1: vpand 80(r16=reg128#8 +# asm 2: vpand 80(r16=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r17 = a11 & mem128[input_2 + 96] +# asm 1: vpand 96(r17=reg128#9 +# asm 2: vpand 96(r17=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r18 = a11 & mem128[input_2 + 112] +# asm 1: vpand 112(r18=reg128#10 +# asm 2: vpand 112(r18=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r19 = a11 & mem128[input_2 + 128] +# asm 1: vpand 128(r19=reg128#11 +# asm 2: vpand 128(r19=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r20 = a11 & mem128[input_2 + 144] +# asm 1: vpand 144(r20=reg128#12 +# asm 2: vpand 144(r20=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r21 = a11 & mem128[input_2 + 160] +# asm 1: vpand 160(r21=reg128#13 +# asm 2: vpand 160(r21=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r22 = a11 & mem128[input_2 + 176] +# asm 1: vpand 176(r22=reg128#2 +# asm 2: vpand 176(r22=%xmm1 +vpand 176(%rdx),%xmm1,%xmm1 + +# qhasm: r13 ^= r22 +# asm 1: pxor r10=reg128#2 +# asm 2: movdqa r10=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#14 +# asm 2: movdqu 160(a10=%xmm13 +movdqu 160(%rsi),%xmm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r21 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#14 +# asm 2: movdqu 144(a9=%xmm13 +movdqu 144(%rsi),%xmm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor 
r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r20 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#14 +# asm 2: movdqu 128(a8=%xmm13 +movdqu 128(%rsi),%xmm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r19 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#14 +# asm 2: movdqu 112(a7=%xmm13 +movdqu 112(%rsi),%xmm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 
+# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r18 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#14 +# asm 2: movdqu 96(a6=%xmm13 +movdqu 96(%rsi),%xmm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r17 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#14 +# asm 2: movdqu 80(a5=%xmm13 +movdqu 80(%rsi),%xmm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 
176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r16 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#14 +# asm 2: movdqu 64(a4=%xmm13 +movdqu 64(%rsi),%xmm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r15 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#14 +# asm 2: movdqu 48(a3=%xmm13 +movdqu 48(%rsi),%xmm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r14 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#14 +# asm 2: movdqu 32(a2=%xmm13 +movdqu 32(%rsi),%xmm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 
32(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r13 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#14 +# asm 2: movdqu 16(a1=%xmm13 +movdqu 16(%rsi),%xmm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r12 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#14 +# asm 2: movdqu 0(a0=%xmm13 +movdqu 0(%rsi),%xmm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm13,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm13,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm13,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm13,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm13,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm13,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm13,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 
+# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm13,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm13,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm13,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm13,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 11 +# asm 1: imulq $11,tmp=int64#6 +# asm 2: imulq $11,tmp=%r9 +imulq $11,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b11=reg128#1 +# asm 2: movddup 0(b11=%xmm0 +movddup 0(%rdx),%xmm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub r16=reg128#3 +# asm 2: vpand r16=%xmm2 +vpand %xmm1,%xmm0,%xmm2 + +# qhasm: mem128[ ptr + 256 ] = r16 +# asm 1: movdqu r15=reg128#4 +# asm 2: vpand r15=%xmm3 +vpand %xmm2,%xmm0,%xmm3 + +# qhasm: a3[0] = mem64[ input_1 + 24 ] +# asm 1: pinsrq $0x0,24(r14=reg128#6 +# asm 2: vpand r14=%xmm5 +vpand %xmm4,%xmm0,%xmm5 + +# qhasm: a2[0] = mem64[ input_1 + 16 ] +# asm 1: pinsrq $0x0,16(r13=reg128#8 +# asm 2: vpand r13=%xmm7 +vpand %xmm6,%xmm0,%xmm7 + +# qhasm: a1[0] = mem64[ input_1 + 8 ] +# asm 1: pinsrq $0x0,8(r12=reg128#10 +# asm 2: vpand r12=%xmm9 +vpand %xmm8,%xmm0,%xmm9 + +# qhasm: a0[0] = mem64[ input_1 + 0 ] +# asm 1: pinsrq $0x0,0(r11=reg128#1 +# asm 2: vpand r11=%xmm0 +vpand %xmm10,%xmm0,%xmm0 + +# qhasm: b10 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b10=reg128#12 +# asm 2: movddup 0(b10=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r11 ^= r +# asm 1: pxor r10=reg128#4 +# asm 2: vpand r10=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b9 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b9=reg128#12 +# asm 2: movddup 0(b9=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm2,%xmm11,%xmm5 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm6,%xmm11,%xmm5 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#6 
+# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r10 ^= r +# asm 1: pxor r9=reg128#6 +# asm 2: vpand r9=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b8 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b8=reg128#12 +# asm 2: movddup 0(b8=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r9 ^= r +# asm 1: pxor r8=reg128#8 +# asm 2: vpand r8=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b7 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b7=reg128#12 +# asm 2: movddup 0(b7=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r8 ^= r +# asm 1: pxor r7=reg128#10 +# asm 2: vpand r7=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b6 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b6=reg128#12 +# asm 2: movddup 0(b6=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r6=reg128#1 +# asm 2: vpand r6=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b5 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b5=reg128#12 +# asm 2: movddup 0(b5=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r6 ^= r +# asm 1: pxor r5=reg128#4 +# asm 2: vpand r5=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b4 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b4=reg128#12 +# asm 2: movddup 0(b4=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm2,%xmm11,%xmm5 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand 
%xmm6,%xmm11,%xmm5 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r5 ^= r +# asm 1: pxor r4=reg128#6 +# asm 2: vpand r4=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b3 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b3=reg128#12 +# asm 2: movddup 0(b3=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r4 ^= r +# asm 1: pxor r3=reg128#8 +# asm 2: vpand r3=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b2 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b2=reg128#12 +# asm 2: movddup 0(b2=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r3 ^= r +# asm 1: pxor r2=reg128#10 +# asm 2: vpand r2=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b1 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b1=reg128#12 +# asm 2: movddup 0(b1=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r1=reg128#1 +# asm 2: vpand r1=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b0 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b0=reg128#12 +# asm 2: movddup 0(b0=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm1,%xmm11,%xmm1 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm2,%xmm11,%xmm1 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm4,%xmm11,%xmm1 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm6,%xmm11,%xmm1 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm8,%xmm11,%xmm1 + +# qhasm: r1 ^= r +# asm 1: pxor r0=reg128#2 +# asm 2: vpand r0=%xmm1 +vpand %xmm10,%xmm11,%xmm1 + +# qhasm: mem128[ ptr + 64 ] = r4 +# asm 1: movdqu h22=int64#2 +# asm 2: movq 264(h22=%rsi +movq 264(%r8),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 248 ] +# asm 1: movq 248(h21=int64#4 +# asm 2: movq 248(h21=%rcx +movq 248(%r8),%rcx + +# qhasm: h12 = h21 +# asm 1: mov h12=int64#6 +# asm 2: 
mov h12=%r9 +mov %rcx,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#4 +# asm 2: mov h9=%rcx +mov %rcx,%rcx + +# qhasm: h20 = mem64[ ptr + 232 ] +# asm 1: movq 232(h20=int64#7 +# asm 2: movq 232(h20=%rax +movq 232(%r8),%rax + +# qhasm: h11 = h20 +# asm 1: mov h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 216 ] +# asm 1: movq 216(h19=int64#9 +# asm 2: movq 216(h19=%r11 +movq 216(%r8),%r11 + +# qhasm: h10 ^= h19 +# asm 1: xor h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 200 ] +# asm 1: movq 200(h18=int64#10 +# asm 2: movq 200(h18=%r12 +movq 200(%r8),%r12 + +# qhasm: h9 ^= h18 +# asm 1: xor h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 184 ] +# asm 1: movq 184(h17=int64#11 +# asm 2: movq 184(h17=%r13 +movq 184(%r8),%r13 + +# qhasm: h8 ^= h17 +# asm 1: xor h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 168 ] +# asm 1: movq 168(h16=int64#12 +# asm 2: movq 168(h16=%r14 +movq 168(%r8),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 152 ] +# asm 1: movq 152(h15=int64#13 +# asm 2: movq 152(h15=%r15 +movq 152(%r8),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 240 ) +# asm 1: xorq 240(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 136 ] +# asm 1: movq 136(h14=int64#14 +# asm 2: movq 136(h14=%rbx +movq 136(%r8),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 120 ) +# asm 1: xorq 120(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 104 ) +# asm 1: xorq 104(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864/sse/vec_reduce_asm.S b/crypto_kem/mceliece348864/sse/vec_reduce_asm.S new file mode 100644 index 00000000..0c9caf14 --- /dev/null +++ b/crypto_kem/mceliece348864/sse/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm +.global 
PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm +_PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm: +PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864/vec/api.h b/crypto_kem/mceliece348864/vec/api.h new file mode 100644 index 00000000..5f1f8d18 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_API_H +#define PQCLEAN_MCELIECE348864_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_enc( + uint8_t 
*c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/benes.c b/crypto_kem/mceliece348864/vec/benes.c new file mode 100644 index 00000000..bd45278f --- /dev/null +++ b/crypto_kem/mceliece348864/vec/benes.c @@ -0,0 +1,95 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* one layer of the benes network */ +static void layer(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_VEC_benes(uint64_t *r, const unsigned char *bits, int rev) { + int i; + + const unsigned char *cond_ptr; + int inc, low; + + uint64_t cond[64]; + + // + + if (rev == 0) { + inc = 256; + cond_ptr = bits; + } else { + inc = -256; + cond_ptr = bits + (2 * GFBITS - 2) * 256; + } + + // + + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(r, r); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_VEC_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(cond, cond); + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(r, r); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_VEC_load8(cond_ptr + i * 8); + } + layer(r, cond, low); + cond_ptr += inc; + } + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_VEC_load8(cond_ptr + i * 8); + } + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(r, r); + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_VEC_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(cond, cond); + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(r, r); +} + diff --git a/crypto_kem/mceliece348864/vec/benes.h b/crypto_kem/mceliece348864/vec/benes.h new file mode 100644 index 00000000..281c9e69 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_BENES_H +#define PQCLEAN_MCELIECE348864_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE348864_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/bm.c b/crypto_kem/mceliece348864/vec/bm.c new file mode 100644 index 00000000..6a87164c --- /dev/null +++ b/crypto_kem/mceliece348864/vec/bm.c @@ -0,0 +1,247 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" + +#include + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, 
uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static inline gf vec_reduce(const vec *in) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +static void update(vec *in, const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[i] = (in[i] >> 1) | (tmp << 63); + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE348864_VEC_bm(vec *out, vec in[][ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + 
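+    // (State in the usual Berlekamp-Massey notation: N indexes the 2*SYS_T
+    //  syndrome coefficients, L is the current LFSR length, C is the current
+    //  connection polynomial, B its saved copy, d the discrepancy and b the
+    //  saved discrepancy.  The inversion-free update below computes
+    //  C <- b*C + d*B instead of dividing by b, and keeps every coefficient
+    //  bitsliced, one vec per bit of a GF(2^12) element, so the three cases
+    //  are selected branchlessly with 64-bit masks.)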
vec prod[ GFBITS ]; + vec in_tmp[ GFBITS ]; + + vec d_vec[ GFBITS ]; + vec b_vec[ GFBITS ]; + vec B[ GFBITS ], C[ GFBITS ]; + vec B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + vec mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + + C[0] = 0; + B[0] = 1; + B[0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + B[i] = C[i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864_VEC_vec_mul(prod, in_tmp, C); + + update(in_tmp, coefs[N]); + + d = vec_reduce(prod); + + t = PQCLEAN_MCELIECE348864_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + d_vec[i] = PQCLEAN_MCELIECE348864_VEC_vec_setbits((d >> i) & 1); + b_vec[i] = PQCLEAN_MCELIECE348864_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE348864_VEC_vec_mul(B_tmp, d_vec, B); + PQCLEAN_MCELIECE348864_VEC_vec_mul(C_tmp, b_vec, C); + + vec_cmov(B, C, (uint16_t)mask); + update(B, mask & c0); + + for (i = 0; i < GFBITS; i++) { + C[i] = B_tmp[i] ^ C_tmp[i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE348864_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE348864_VEC_vec_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece348864/vec/bm.h b/crypto_kem/mceliece348864/vec/bm.h new file mode 100644 index 00000000..79ffd441 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_BM_H +#define PQCLEAN_MCELIECE348864_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE348864_VEC_bm(vec * /*out*/, vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/consts.inc b/crypto_kem/mceliece348864/vec/consts.inc new file mode 100644 index 00000000..a728344f --- /dev/null +++ b/crypto_kem/mceliece348864/vec/consts.inc @@ -0,0 +1,888 @@ +//64 +{ + 0XF00F0FF0F00F0FF0, + 0XF0F00F0F0F0FF0F0, + 0X0FF00FF00FF00FF0, + 0XAA5555AAAA5555AA, + 0XF00F0FF0F00F0FF0, + 0X33CCCC33CC3333CC, + 0XFFFF0000FFFF0000, + 0XCC33CC3333CC33CC, + 0X33CC33CC33CC33CC, + 0X5A5A5A5A5A5A5A5A, + 0XFF00FF00FF00FF00, + 0XF00F0FF0F00F0FF0, +}, +//128 +{ + 0X3C3C3C3C3C3C3C3C, + 0XF0F0F0F0F0F0F0F0, + 0X5555AAAA5555AAAA, + 0XCC3333CCCC3333CC, + 0XC33CC33CC33CC33C, + 0X55555555AAAAAAAA, + 0X33333333CCCCCCCC, + 0X00FF00FFFF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0X0000000000000000, + 0X0000FFFFFFFF0000, + 0XF0F00F0F0F0FF0F0, +}, +{ + 0X3C3C3C3C3C3C3C3C, + 0X0F0F0F0F0F0F0F0F, + 0XAAAA5555AAAA5555, + 0XCC3333CCCC3333CC, + 0XC33CC33CC33CC33C, + 0X55555555AAAAAAAA, + 0X33333333CCCCCCCC, + 0XFF00FF0000FF00FF, + 0X0F0F0F0F0F0F0F0F, + 0X0000000000000000, + 0X0000FFFFFFFF0000, + 0XF0F00F0F0F0FF0F0, +}, +//256 +{ + 0XAA55AA5555AA55AA, + 0XCC33CC3333CC33CC, + 0X33CCCC33CC3333CC, + 0X55555555AAAAAAAA, + 0XFF0000FF00FFFF00, + 0X3CC33CC3C33CC33C, + 0X5555AAAA5555AAAA, + 0X0FF00FF00FF00FF0, + 0XCCCC33333333CCCC, + 0XF0F0F0F0F0F0F0F0, + 0X00FFFF0000FFFF00, + 0XC33CC33CC33CC33C, +}, +{ + 0X55AA55AAAA55AA55, + 0XCC33CC3333CC33CC, + 0XCC3333CC33CCCC33, + 0X55555555AAAAAAAA, + 0XFF0000FF00FFFF00, + 0XC33CC33C3CC33CC3, + 
0XAAAA5555AAAA5555, + 0XF00FF00FF00FF00F, + 0X3333CCCCCCCC3333, + 0X0F0F0F0F0F0F0F0F, + 0XFF0000FFFF0000FF, + 0XC33CC33CC33CC33C, +}, +{ + 0XAA55AA5555AA55AA, + 0X33CC33CCCC33CC33, + 0XCC3333CC33CCCC33, + 0X55555555AAAAAAAA, + 0X00FFFF00FF0000FF, + 0X3CC33CC3C33CC33C, + 0X5555AAAA5555AAAA, + 0X0FF00FF00FF00FF0, + 0X3333CCCCCCCC3333, + 0XF0F0F0F0F0F0F0F0, + 0X00FFFF0000FFFF00, + 0XC33CC33CC33CC33C, +}, +{ + 0X55AA55AAAA55AA55, + 0X33CC33CCCC33CC33, + 0X33CCCC33CC3333CC, + 0X55555555AAAAAAAA, + 0X00FFFF00FF0000FF, + 0XC33CC33C3CC33CC3, + 0XAAAA5555AAAA5555, + 0XF00FF00FF00FF00F, + 0XCCCC33333333CCCC, + 0X0F0F0F0F0F0F0F0F, + 0XFF0000FFFF0000FF, + 0XC33CC33CC33CC33C, +}, +//512 +{ + 0X6699669999669966, + 0X33CCCC33CC3333CC, + 0XA5A5A5A55A5A5A5A, + 0X3C3CC3C3C3C33C3C, + 0XF00FF00F0FF00FF0, + 0X55AA55AA55AA55AA, + 0X3C3CC3C3C3C33C3C, + 0X0F0F0F0FF0F0F0F0, + 0X55AA55AA55AA55AA, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0X33CCCC33CC3333CC, + 0XA5A5A5A55A5A5A5A, + 0X3C3CC3C3C3C33C3C, + 0X0FF00FF0F00FF00F, + 0XAA55AA55AA55AA55, + 0X3C3CC3C3C3C33C3C, + 0XF0F0F0F00F0F0F0F, + 0XAA55AA55AA55AA55, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0X33CCCC33CC3333CC, + 0X5A5A5A5AA5A5A5A5, + 0XC3C33C3C3C3CC3C3, + 0X0FF00FF0F00FF00F, + 0XAA55AA55AA55AA55, + 0XC3C33C3C3C3CC3C3, + 0X0F0F0F0FF0F0F0F0, + 0XAA55AA55AA55AA55, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0X33CCCC33CC3333CC, + 0X5A5A5A5AA5A5A5A5, + 0XC3C33C3C3C3CC3C3, + 0XF00FF00F0FF00FF0, + 0X55AA55AA55AA55AA, + 0XC3C33C3C3C3CC3C3, + 0XF0F0F0F00F0F0F0F, + 0X55AA55AA55AA55AA, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0XCC3333CC33CCCC33, + 0X5A5A5A5AA5A5A5A5, + 0X3C3CC3C3C3C33C3C, + 0X0FF00FF0F00FF00F, + 0X55AA55AA55AA55AA, + 0X3C3CC3C3C3C33C3C, + 0X0F0F0F0FF0F0F0F0, + 0X55AA55AA55AA55AA, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0XCC3333CC33CCCC33, + 0X5A5A5A5AA5A5A5A5, + 0X3C3CC3C3C3C33C3C, + 0XF00FF00F0FF00FF0, + 0XAA55AA55AA55AA55, + 0X3C3CC3C3C3C33C3C, + 0XF0F0F0F00F0F0F0F, + 0XAA55AA55AA55AA55, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0XCC3333CC33CCCC33, + 0XA5A5A5A55A5A5A5A, + 0XC3C33C3C3C3CC3C3, + 0XF00FF00F0FF00FF0, + 0XAA55AA55AA55AA55, + 0XC3C33C3C3C3CC3C3, + 0X0F0F0F0FF0F0F0F0, + 0XAA55AA55AA55AA55, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0XCC3333CC33CCCC33, + 0XA5A5A5A55A5A5A5A, + 0XC3C33C3C3C3CC3C3, + 0X0FF00FF0F00FF00F, + 0X55AA55AA55AA55AA, + 0XC3C33C3C3C3CC3C3, + 0XF0F0F0F00F0F0F0F, + 0X55AA55AA55AA55AA, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +//1024 +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 
0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 
0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +//2048 +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 
0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 
0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +} diff --git a/crypto_kem/mceliece348864/vec/controlbits.c b/crypto_kem/mceliece348864/vec/controlbits.c new file mode 100644 index 00000000..3c43028f --- /dev/null +++ b/crypto_kem/mceliece348864/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864/vec/controlbits.h b/crypto_kem/mceliece348864/vec/controlbits.h new file mode 100644 index 00000000..cce484fa --- /dev/null +++ b/crypto_kem/mceliece348864/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/crypto_hash.h b/crypto_kem/mceliece348864/vec/crypto_hash.h new file mode 100644 index 00000000..b094cb38 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864/vec/decrypt.c b/crypto_kem/mceliece348864/vec/decrypt.c new file mode 100644 index 00000000..f9a8ec16 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[ GFBITS ]; + vec eval[64][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE348864_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_VEC_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_VEC_vec_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE348864_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static uint16_t weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 2; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return 
(uint16_t)PQCLEAN_MCELIECE348864_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 64 ][ GFBITS ]; + vec scaled[ 64 ][ GFBITS ]; + vec eval[ 64 ][ GFBITS ]; + + vec error[ 64 ]; + + vec s_priv[ 2 ][ GFBITS ]; + vec s_priv_cmp[ 2 ][ GFBITS ]; + vec locator[ GFBITS ]; + + vec recv[ 64 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE348864_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE348864_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864_VEC_vec_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE348864_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE348864_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE348864_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864/vec/decrypt.h b/crypto_kem/mceliece348864/vec/decrypt.h new file mode 100644 index 00000000..c2f560a0 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE348864_VEC_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE348864_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/encrypt.c b/crypto_kem/mceliece348864/vec/encrypt.c new file mode 100644 index 00000000..4ee27dda --- /dev/null +++ b/crypto_kem/mceliece348864/vec/encrypt.c @@ -0,0 +1,135 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = (uint16_t)ind_8[i + 1] << 8 | ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + 
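// store the error vector e: all full 64-bit words first, then the remaining SYS_N % 64 bits of the last word byte by byte + 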
for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE348864_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE348864_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE348864_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE348864_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +void PQCLEAN_MCELIECE348864_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864/vec/encrypt.h b/crypto_kem/mceliece348864/vec/encrypt.h new file mode 100644 index 00000000..0fb7ab8f --- /dev/null +++ b/crypto_kem/mceliece348864/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/fft.c b/crypto_kem/mceliece348864/vec/fft.c new file mode 100644 index 00000000..3eb33de4 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/fft.c @@ -0,0 +1,113 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864_VEC_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(uint64_t out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t consts[ 63 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + for (j 
= 0; j < 64; j++) { + for (i = 0; i < GFBITS; i++) { + out[j][i] = (in[i] >> reversal[j]) & 1; + out[j][i] = -out[j][i]; + } + } + + // butterflies + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // + + // adding the part contributed by x^64 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } + +} + +void PQCLEAN_MCELIECE348864_VEC_fft(vec out[][ GFBITS ], vec *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece348864/vec/fft.h b/crypto_kem/mceliece348864/vec/fft.h new file mode 100644 index 00000000..5d28c208 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/fft.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_FFT_H +#define PQCLEAN_MCELIECE348864_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "params.h" +#include "vec.h" +#include + +void PQCLEAN_MCELIECE348864_VEC_fft(vec /*out*/[][ GFBITS ], vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/fft_tr.c b/crypto_kem/mceliece348864/vec/fft_tr.c new file mode 100644 index 00000000..71081cbf --- /dev/null +++ b/crypto_kem/mceliece348864/vec/fft_tr.c @@ -0,0 +1,268 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include + +#define vec_add(z, x, y) \ + for (b = 0; b < GFBITS; b++) { \ + (z)[b] = (x)[b] ^ (y)[b]; \ + } + +static inline void radix_conversions_tr(uint64_t in[][ GFBITS ]) { + int i, j, k; + + const uint64_t mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const uint64_t s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 5; j >= 0; j--) { + if (j < 5) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE348864_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= (in[0][i] & mask[5][0]) >> 32; + in[1][i] ^= (in[1][i] & mask[5][1]) << 32; + } + } +} + +static inline void butterflies_tr(uint64_t out[][ GFBITS ], uint64_t in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t buf[64]; + + const uint64_t consts[ 63 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 63; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 
55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + vec_add(in[k], in[k], in[k + s]); + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + vec_add(in[k + s], in[k + s], tmp); + } + } + } + + // transpose + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < 64; j++) { + buf[ reversal[j] ] = in[j][i]; + } + + PQCLEAN_MCELIECE348864_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 64; j++) { + in[j][i] = buf[ j ]; + } + } + + // boradcast + + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[0], in[32]); + vec_add(in[33], in[33], in[32]); + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[1], in[33]); + vec_add(in[35], in[35], in[33]); + vec_add(pre[0], pre[0], in[35]); + vec_add(in[34], in[34], in[35]); + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[2], in[34]); + vec_add(in[38], in[38], in[34]); + vec_add(pre[0], pre[0], in[38]); + vec_add(in[39], in[39], in[38]); + vec_add(pre[1], pre[1], in[39]); + vec_add(in[37], in[37], in[39]); + vec_add(pre[0], pre[0], in[37]); + vec_add(in[36], in[36], in[37]); + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[3], in[36]); + vec_add(in[44], in[44], in[36]); + vec_add(pre[0], pre[0], in[44]); + vec_add(in[45], in[45], in[44]); + vec_add(pre[1], pre[1], in[45]); + vec_add(in[47], in[47], in[45]); + vec_add(pre[0], pre[0], in[47]); + vec_add(in[46], in[46], in[47]); + vec_add(pre[2], pre[2], in[46]); + vec_add(in[42], in[42], in[46]); + vec_add(pre[0], pre[0], in[42]); + vec_add(in[43], in[43], in[42]); + vec_add(pre[1], pre[1], in[43]); + vec_add(in[41], in[41], in[43]); + vec_add(pre[0], pre[0], in[41]); + vec_add(in[40], in[40], in[41]); + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[4], in[40]); + vec_add(in[56], in[56], in[40]); + vec_add(pre[0], pre[0], in[56]); + vec_add(in[57], in[57], in[56]); + vec_add(pre[1], pre[1], in[57]); + vec_add(in[59], in[59], in[57]); + vec_add(pre[0], pre[0], in[59]); + vec_add(in[58], in[58], in[59]); + vec_add(pre[2], pre[2], in[58]); + vec_add(in[62], in[62], in[58]); + vec_add(pre[0], pre[0], in[62]); + vec_add(in[63], in[63], in[62]); + vec_add(pre[1], pre[1], in[63]); + vec_add(in[61], in[61], in[63]); + vec_add(pre[0], pre[0], in[61]); + vec_add(in[60], in[60], in[61]); + vec_add(pre[3], pre[3], in[60]); + vec_add(in[52], in[52], in[60]); + vec_add(pre[0], pre[0], in[52]); + vec_add(in[53], in[53], in[52]); + vec_add(pre[1], pre[1], in[53]); + vec_add(in[55], in[55], in[53]); + vec_add(pre[0], pre[0], in[55]); + vec_add(in[54], in[54], in[55]); + vec_add(pre[2], pre[2], in[54]); + vec_add(in[50], in[50], in[54]); + vec_add(pre[0], pre[0], in[50]); + vec_add(in[51], in[51], in[50]); + vec_add(pre[1], pre[1], in[51]); + vec_add(in[49], in[49], in[51]); + vec_add(pre[0], pre[0], in[49]); + vec_add(in[48], in[48], in[49]); + PQCLEAN_MCELIECE348864_VEC_vec_copy(pre[5], in[48]); + vec_add(in[16], in[16], in[48]); + vec_add(pre[0], pre[0], in[16]); + vec_add(in[17], in[17], in[16]); + vec_add(pre[1], pre[1], in[17]); + vec_add(in[19], in[19], in[17]); + vec_add(pre[0], pre[0], in[19]); + vec_add(in[18], in[18], in[19]); + vec_add(pre[2], pre[2], in[18]); + vec_add(in[22], in[22], in[18]); + vec_add(pre[0], pre[0], in[22]); + vec_add(in[23], in[23], in[22]); + vec_add(pre[1], pre[1], in[23]); + vec_add(in[21], in[21], in[23]); + vec_add(pre[0], pre[0], in[21]); + vec_add(in[20], in[20], in[21]); + vec_add(pre[3], pre[3], in[20]); + 
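// pre[0..5] keep running sums of the transposed inputs; below they are scaled by the bits of beta[] to form out[1] + 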
vec_add(in[28], in[28], in[20]); + vec_add(pre[0], pre[0], in[28]); + vec_add(in[29], in[29], in[28]); + vec_add(pre[1], pre[1], in[29]); + vec_add(in[31], in[31], in[29]); + vec_add(pre[0], pre[0], in[31]); + vec_add(in[30], in[30], in[31]); + vec_add(pre[2], pre[2], in[30]); + vec_add(in[26], in[26], in[30]); + vec_add(pre[0], pre[0], in[26]); + vec_add(in[27], in[27], in[26]); + vec_add(pre[1], pre[1], in[27]); + vec_add(in[25], in[25], in[27]); + vec_add(pre[0], pre[0], in[25]); + vec_add(in[24], in[24], in[25]); + vec_add(pre[4], pre[4], in[24]); + vec_add(in[8], in[8], in[24]); + vec_add(pre[0], pre[0], in[8]); + vec_add(in[9], in[9], in[8]); + vec_add(pre[1], pre[1], in[9]); + vec_add(in[11], in[11], in[9]); + vec_add(pre[0], pre[0], in[11]); + vec_add(in[10], in[10], in[11]); + vec_add(pre[2], pre[2], in[10]); + vec_add(in[14], in[14], in[10]); + vec_add(pre[0], pre[0], in[14]); + vec_add(in[15], in[15], in[14]); + vec_add(pre[1], pre[1], in[15]); + vec_add(in[13], in[13], in[15]); + vec_add(pre[0], pre[0], in[13]); + vec_add(in[12], in[12], in[13]); + vec_add(pre[3], pre[3], in[12]); + vec_add(in[4], in[4], in[12]); + vec_add(pre[0], pre[0], in[4]); + vec_add(in[5], in[5], in[4]); + vec_add(pre[1], pre[1], in[5]); + vec_add(in[7], in[7], in[5]); + vec_add(pre[0], pre[0], in[7]); + vec_add(in[6], in[6], in[7]); + vec_add(pre[2], pre[2], in[6]); + vec_add(in[2], in[2], in[6]); + vec_add(pre[0], pre[0], in[2]); + vec_add(in[3], in[3], in[2]); + vec_add(pre[1], pre[1], in[3]); + vec_add(in[1], in[1], in[3]); + + vec_add(pre[0], pre[0], in[1]); + vec_add(out[0], in[0], in[1]); + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_VEC_vec_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp, pre[i], tmp); + vec_add(out[1], out[1], tmp); + } +} + +void PQCLEAN_MCELIECE348864_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864/vec/fft_tr.h b/crypto_kem/mceliece348864/vec/fft_tr.h new file mode 100644 index 00000000..44139f36 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE348864_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE348864_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/gf.c b/crypto_kem/mceliece348864/vec/gf.c new file mode 100644 index 00000000..dff87128 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_VEC_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* 
input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_VEC_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_VEC_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_VEC_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_VEC_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_VEC_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_VEC_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_VEC_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_VEC_gf_mul(PQCLEAN_MCELIECE348864_VEC_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864/vec/gf.h b/crypto_kem/mceliece348864/vec/gf.h new file mode 100644 index 00000000..65c6c374 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_GF_H +#define PQCLEAN_MCELIECE348864_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864_VEC_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864_VEC_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t 
PQCLEAN_MCELIECE348864_VEC_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/operations.c b/crypto_kem/mceliece348864/vec/operations.c new file mode 100644 index 00000000..168620ca --- /dev/null +++ b/crypto_kem/mceliece348864/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/vec/params.h b/crypto_kem/mceliece348864/vec/params.h new file mode 100644 index 00000000..cd1a0819 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_PARAMS_H +#define PQCLEAN_MCELIECE348864_VEC_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 
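+// for this parameter set (m = GFBITS = 12, n = SYS_N = 3488, t = SYS_T = 64) the derived macros below work out to IRR_BYTES = 128, COND_BYTES = 5888, SK_BYTES = 6452 and SYND_BYTES = 96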
+#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864/vec/pk_gen.c b/crypto_kem/mceliece348864/vec/pk_gen.c new file mode 100644 index 00000000..9b50c0f3 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/pk_gen.c @@ -0,0 +1,238 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE348864_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS_I; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + uint64_t irr_int[ GFBITS ]; + + vec consts[64][ GFBITS ]; + vec eval[ 64 ][ GFBITS ]; + vec prod[ 64 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 64 ]; + + // compute the inverses + + PQCLEAN_MCELIECE348864_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE348864_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_VEC_vec_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + 
for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE348864_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + + for (k = 0; k < NBLOCKS_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS_H - 1; k++) { + PQCLEAN_MCELIECE348864_VEC_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE348864_VEC_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864/vec/pk_gen.h b/crypto_kem/mceliece348864/vec/pk_gen.h new file mode 100644 index 00000000..c577a9b2 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE348864_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE348864_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/powers.inc b/crypto_kem/mceliece348864/vec/powers.inc new file mode 100644 index 00000000..a37fb2bd --- /dev/null +++ b/crypto_kem/mceliece348864/vec/powers.inc @@ -0,0 +1,896 @@ +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 
0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 
0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 
0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 
0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 
0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +} diff --git a/crypto_kem/mceliece348864/vec/scalars.inc b/crypto_kem/mceliece348864/vec/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 
0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864/vec/scalars_2x.inc b/crypto_kem/mceliece348864/vec/scalars_2x.inc new file mode 100644 index 00000000..e7c7fee5 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/scalars_2x.inc @@ -0,0 +1,140 @@ +{{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000C03C0C3C0330C, + 0XF330CFFCC00F33C0, + 0XCCF330F00F3C0333, + 0XFF03FFF3FF0CF0C0, + 0X3CC3FCF00FCC303C, + 0X0F000C0FC30303F3, + 0XCF0FC3FF333CCF3C, + 0X003F3FC3C0FF333F, + 0X3CC3F0F3CF0FF00F, + 0XF3F33CC03FC30CC0, + 0X3CC330CFC333F33F, + 0X3CC0303FF3C3FFFC, +}}, +{{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0F00F00F00000000, + 0XF00000000000F000, + 0X00000F00000000F0, + 0X0F00F00000F00000, + 0X000F00000F00F00F, + 0X00F00F00F00F0000, + 0X0F00F00000000000, + 0X000000000F000000, + 0X00F00000000F00F0, + 0X0000F00F00000F00, + 0XF00000F00000F00F, + 0X00000F00F00F00F0, +}}, +{{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0XFF00FFFFFF000000, + 0XFF0000FFFF000000, + 0XFFFF00FFFF000000, + 0XFF00FFFFFFFFFF00, + 0X00000000FF00FF00, + 0XFFFFFFFF00FF0000, + 0X00FFFFFF00FF0000, + 0XFFFF00FFFF00FFFF, + 0XFFFF0000FFFFFFFF, + 0XFF00000000FF0000, + 0X000000FF00FF00FF, + 0X00FF00FF00FFFF00, +}}, +{{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X0000000000000000, + 0XFFFF000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFF00000000FFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF00000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF00000000FFFF, + 0X00000000FFFF0000, +}}, +{{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, +}} diff --git a/crypto_kem/mceliece348864/vec/sk_gen.c b/crypto_kem/mceliece348864/vec/sk_gen.c new file mode 100644 index 00000000..b82db7cb --- /dev/null +++ b/crypto_kem/mceliece348864/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for 
secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // Gaussian elimination + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_VEC_gf_mul(mat[ c ][ j ], inv); + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864/vec/sk_gen.h b/crypto_kem/mceliece348864/vec/sk_gen.h new file mode 100644 index 00000000..aad2d4d1 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE348864_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE348864_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/transpose.c b/crypto_kem/mceliece348864/vec/transpose.c new file mode 100644 index 00000000..bc2d9d70 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE348864_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git 
a/crypto_kem/mceliece348864/vec/transpose.h b/crypto_kem/mceliece348864/vec/transpose.h new file mode 100644 index 00000000..02b7a9c4 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE348864_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include <stdint.h> + + +void PQCLEAN_MCELIECE348864_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece348864/vec/util.c b/crypto_kem/mceliece348864/vec/util.c new file mode 100644 index 00000000..5eb164b6 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/util.c @@ -0,0 +1,94 @@ +#include "util.h" + + +void PQCLEAN_MCELIECE348864_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_VEC_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864_VEC_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_VEC_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} diff --git a/crypto_kem/mceliece348864/vec/util.h b/crypto_kem/mceliece348864/vec/util.h new file mode 100644 index 00000000..38ddc754 --- /dev/null +++ b/crypto_kem/mceliece348864/vec/util.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_UTIL_H +#define PQCLEAN_MCELIECE348864_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" + +#include <stdint.h> + + +void PQCLEAN_MCELIECE348864_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE348864_VEC_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864_VEC_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864_VEC_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864_VEC_bitrev(gf a); 
+#endif diff --git a/crypto_kem/mceliece348864/vec/vec.c b/crypto_kem/mceliece348864/vec/vec.c new file mode 100644 index 00000000..9dbab9ad --- /dev/null +++ b/crypto_kem/mceliece348864/vec/vec.c @@ -0,0 +1,131 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE348864_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE348864_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE348864_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE348864_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE348864_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE348864_VEC_vec_sq(vec *out, const vec *in) { + int i; + uint64_t result[GFBITS]; + + // + + result[0] = in[0] ^ in[6]; + result[1] = in[11]; + result[2] = in[1] ^ in[7]; + result[3] = in[6]; + result[4] = in[2] ^ in[11] ^ in[8]; + result[5] = in[7]; + result[6] = in[3] ^ in[9]; + result[7] = in[8]; + result[8] = in[4] ^ in[10]; + result[9] = in[9]; + result[10] = in[5] ^ in[11]; + result[11] = in[10]; + + // + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE348864_VEC_vec_inv(vec *out, const vec *in) { + uint64_t tmp_11[GFBITS]; + uint64_t tmp_1111[GFBITS]; + + PQCLEAN_MCELIECE348864_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp_11, out, in); // 11 + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_mul(tmp_1111, out, tmp_11); // 1111 + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_mul(out, out, tmp_1111); // 11111111 + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_mul(out, out, tmp_11); // 1111111111 + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864_VEC_vec_mul(out, out, in); // 11111111111 + + PQCLEAN_MCELIECE348864_VEC_vec_sq(out, out); // 111111111110 +} diff --git a/crypto_kem/mceliece348864/vec/vec.h b/crypto_kem/mceliece348864/vec/vec.h new file mode 100644 index 00000000..808c69fc --- /dev/null +++ b/crypto_kem/mceliece348864/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE348864_VEC_VEC_H +#define PQCLEAN_MCELIECE348864_VEC_VEC_H + +#include "params.h" + +#include <stdint.h> + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE348864_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE348864_VEC_vec_set1_16b(uint16_t v); + +void 
PQCLEAN_MCELIECE348864_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE348864_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE348864_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE348864_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE348864_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE348864_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/META.yml b/crypto_kem/mceliece348864f/META.yml new file mode 100644 index 00000000..88c290a9 --- /dev/null +++ b/crypto_kem/mceliece348864f/META.yml @@ -0,0 +1,50 @@ +name: Classic McEliece 348864f +type: kem +claimed-nist-level: 1 +claimed-security: IND-CCA2 +length-public-key: 261120 +length-secret-key: 6452 +length-ciphertext: 128 +length-shared-secret: 32 +nistkat-sha256: f0a166a9115a0c8481c85aee3fe901729a21a8a84a5d2b871fb99fc50223046b +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - bmi1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt + - bmi2 diff --git a/crypto_kem/mceliece348864f/avx/LICENSE b/crypto_kem/mceliece348864f/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece348864f/avx/Makefile b/crypto_kem/mceliece348864f/avx/Makefile new file mode 100644 index 00000000..63a4fc4c --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/Makefile @@ -0,0 +1,42 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece348864f_avx.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c transpose.c \ + util.c uint32_sort.o vec.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x256_sp_asm.S \ + transpose_64x64_asm.S update_asm.S vec128_mul_asm.S vec256_mul_asm.S \ + vec_mul_asm.S vec_mul_sp_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h \ + vec128.h vec256.h vec.h \ + consts.inc powers.inc scalars_2x.inc scalars.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o transpose.o \ + util.o uint32_sort.o vec.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x256_sp_asm.o \ + transpose_64x64_asm.o update_asm.o vec128_mul_asm.o vec256_mul_asm.o \ + vec_mul_asm.o vec_mul_sp_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece348864f/avx/aes256ctr.c b/crypto_kem/mceliece348864f/avx/aes256ctr.c new file mode 100644 index 00000000..4c62840c --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece348864f/avx/aes256ctr.h b/crypto_kem/mceliece348864f/avx/aes256ctr.h new file mode 100644 index 00000000..d441eddd --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE348864F_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864f/avx/api.h b/crypto_kem/mceliece348864f/avx/api.h new file mode 100644 index 00000000..4dbdab6d --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/api.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_API_H +#define PQCLEAN_MCELIECE348864F_AVX_API_H + +#include + +#define 
PQCLEAN_MCELIECE348864F_AVX_CRYPTO_ALGNAME "Classic McEliece 348864f" +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_BYTES 32 + + + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/benes.c b/crypto_kem/mceliece348864f/avx/benes.c new file mode 100644 index 00000000..8d3f218d --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 
32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864F_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864F_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); +} + diff --git a/crypto_kem/mceliece348864f/avx/benes.h b/crypto_kem/mceliece348864f/avx/benes.h new file mode 100644 index 00000000..53398f3e --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_BENES_H +#define 
PQCLEAN_MCELIECE348864F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_AVX_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864F_AVX_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/bm.c b/crypto_kem/mceliece348864f/avx/bm.c new file mode 100644 index 00000000..bfd6bdcb --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include + +extern void PQCLEAN_MCELIECE348864F_AVX_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864F_AVX_vec128_or(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864F_AVX_vec128_or(PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 
1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +void PQCLEAN_MCELIECE348864F_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]); + + PQCLEAN_MCELIECE348864F_AVX_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864F_AVX_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864F_AVX_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(out, out, &BC[0][0]); +} + diff --git a/crypto_kem/mceliece348864f/avx/bm.h b/crypto_kem/mceliece348864f/avx/bm.h new file mode 100644 index 00000000..6852977a --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_BM_H +#define PQCLEAN_MCELIECE348864F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_AVX_bm(uint64_t * /*out*/, vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/consts.S b/crypto_kem/mceliece348864f/avx/consts.S new file mode 100644 index 00000000..63e6defe --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE348864F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK0_1 +.globl 
PQCLEAN_MCELIECE348864F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE348864F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece348864f/avx/consts.inc b/crypto_kem/mceliece348864f/avx/consts.inc new file mode 100644 index 00000000..c93f9c79 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/consts.inc @@ -0,0 +1,238 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 
0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 
0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+ PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/crypto_kem/mceliece348864f/avx/controlbits.c b/crypto_kem/mceliece348864f/avx/controlbits.c new file mode 100644 index 00000000..08caa6ef --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864f/avx/controlbits.h b/crypto_kem/mceliece348864f/avx/controlbits.h new file mode 100644 index 00000000..e70e727d --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/crypto_hash.h b/crypto_kem/mceliece348864f/avx/crypto_hash.h new file mode 100644 index 00000000..a294e428 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864f/avx/decrypt.c b/crypto_kem/mceliece348864f/avx/decrypt.c new file mode 100644 index 00000000..7737514c --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + uint64_t sk_int[ GFBITS ]; + vec256 eval[16][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE348864F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, sk_int); + + for (i = 0; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_inv(tmp, inv[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE348864F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 *s0, vec128 *s1) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864F_AVX_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864F_AVX_vec128_or(diff, PQCLEAN_MCELIECE348864F_AVX_vec128_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE348864F_AVX_vec128_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 16 ][ GFBITS ]; + vec256 scaled[ 16 ][ GFBITS ]; + vec256 eval[16][ GFBITS ]; + + vec128 error128[ 32 ]; + vec256 error256[ 16 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + uint64_t locator[ GFBITS ]; + + vec128 recv128[ 32 ]; + vec256 recv256[ 16 ]; + vec256 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE348864F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864F_AVX_benes((uint64_t *) recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE348864F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 16; i++) { + error256[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE348864F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE348864F_AVX_benes((uint64_t *) error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864f/avx/decrypt.h b/crypto_kem/mceliece348864f/avx/decrypt.h new file mode 100644 index 00000000..962f5402 
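Editorial aside on decrypt.c above: weight_check() and synd_cmp() avoid branching on secret data by folding their comparisons into a mask, and decrypt() then returns 0 for success via 1 - (check_synd & check_weight). Below is a minimal standalone sketch of that constant-time equality pattern; the function name and the test harness are illustrative only and are not part of the PQClean code.

#include <stdint.h>
#include <stdio.h>

/* Returns 1 iff w0 == target and w1 == target, without data-dependent
   branches.  XOR maps equality to zero, OR combines the two tests, and the
   decrement-and-shift turns "is zero" into a 0/1 flag.  This is only valid
   when the OR result fits in 15 bits, which holds for the Hamming weights
   checked in weight_check(). */
static uint16_t ct_both_equal(uint16_t w0, uint16_t w1, uint16_t target) {
    uint16_t check = (uint16_t) ((w0 ^ target) | (w1 ^ target));
    check = (uint16_t) (check - 1);
    check >>= 15;
    return check;
}

int main(void) {
    printf("%u\n", ct_both_equal(64, 64, 64)); /* prints 1: accept */
    printf("%u\n", ct_both_equal(63, 64, 64)); /* prints 0: reject */
    return 0;
}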
--- /dev/null +++ b/crypto_kem/mceliece348864f/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE348864F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/encrypt.c b/crypto_kem/mceliece348864f/avx/encrypt.c new file mode 100644 index 00000000..09283ffd --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE348864F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864f/avx/encrypt.h b/crypto_kem/mceliece348864f/avx/encrypt.h new file mode 100644 index 00000000..ea547501 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE348864F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/fft.c b/crypto_kem/mceliece348864f/avx/fft.c new file mode 100644 index 00000000..74f95edb --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/fft.c @@ -0,0 +1,172 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + 
{0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1, t2, t3; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec256 tmp256[ GFBITS ]; + vec256 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 8) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 4]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 6]) & 1; + t3 = -t3; + + out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 5]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 7]) & 1; + t3 = -t3; + + out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3); + } + } + + // + + for (i = 0; i < 16; i += 2) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, y, consts[ 1 ]); + + for (b = 0; b < GFBITS; b++) { + x[b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + y[b] ^= x[b]; + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(x[b], y[b]); + } + } + + consts_ptr = 2; + + for (i = 0; i <= 3; i++) { + s = 1 << i; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec256 powers[16][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 16; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +void 
PQCLEAN_MCELIECE348864F_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece348864f/avx/fft.h b/crypto_kem/mceliece348864f/avx/fft.h new file mode 100644 index 00000000..fe39685a --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/fft.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_FFT_H +#define PQCLEAN_MCELIECE348864F_AVX_FFT_H + +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864F_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/fft_tr.c b/crypto_kem/mceliece348864f/avx/fft_tr.c new file mode 100644 index 00000000..a8e1467d --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864F_AVX_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t out64[2][64]; + + vec256 p2[ 6 ]; + vec256 buf[64]; + vec256 x[ GFBITS ], y[ GFBITS ]; + vec256 tmp256[ GFBITS ]; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 17; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 
1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 3; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp256[b]; + } + } + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + x[b] ^= y[b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, x, consts[ 1 ]); + for (b = 0; b < GFBITS; b++) { + y[b] ^= tmp256[b]; + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(x[b], y[b]); + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] ^= in[i + 1][b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] ^= tmp256[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 4) { + for (j = 0; j < 64; j += 8) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1)); + buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2)); + buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2)); + 
buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3)); + buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3)); + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[61], buf[63]); + p2[0] = 
PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[8]); + buf[9] = 
PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 1); + pre[j][i + 2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 2); + pre[j][i + 3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 3); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 1); + out64[0][i + 2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 2); + out64[0][i + 3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 3); + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(out64[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(tmp, pre[i], tmp); + PQCLEAN_MCELIECE348864F_AVX_vec_add(out64[1], out64[1], tmp); + } + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(out64[0][i], out64[1][i]); + } +} + +void PQCLEAN_MCELIECE348864F_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864f/avx/fft_tr.h b/crypto_kem/mceliece348864f/avx/fft_tr.h new file mode 100644 index 00000000..0b65c324 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE348864F_AVX_FFT_TR_H +/* + This 
file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864F_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/gf.c b/crypto_kem/mceliece348864f/avx/gf.c new file mode 100644 index 00000000..f452be9f --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864F_AVX_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864F_AVX_gf_mul(PQCLEAN_MCELIECE348864F_AVX_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; 
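+    /* Editor's note, not part of the upstream source: b0 and b1 are packed
+       into the low and high 32-bit halves of t1 so that the loop below
+       multiplies a by both field elements at once.  mask starts as
+       0x0000000100000001 and is doubled each iteration, selecting bit i of
+       b0 and bit i of b1 simultaneously; the two partial products cannot
+       overlap because a has at most GFBITS = 12 bits, so a << i stays below
+       bit 32 for every i < GFBITS. */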
+ t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864f/avx/gf.h b/crypto_kem/mceliece348864f/avx/gf.h new file mode 100644 index 00000000..5b7ca3f3 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_GF_H +#define PQCLEAN_MCELIECE348864F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/int32_sort.c b/crypto_kem/mceliece348864f/avx/int32_sort.c new file mode 100644 index 00000000..6ab02550 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* 
A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + 
c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, 
..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= 
mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 
= int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + 
int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = 
_mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE348864F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE348864F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = 
int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, 
e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j 
+ 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece348864f/avx/int32_sort.h b/crypto_kem/mceliece348864f/avx/int32_sort.h new file mode 100644 index 00000000..3d07fbfb --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE348864F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE348864F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece348864f/avx/operations.c b/crypto_kem/mceliece348864f/avx/operations.c new file mode 100644 index 00000000..cefb5d6b --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864F_AVX_aes256ctr(r, 
sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/avx/params.h b/crypto_kem/mceliece348864f/avx/params.h new file mode 100644 index 00000000..c0474ae1 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE348864F_AVX_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/pk_gen.c b/crypto_kem/mceliece348864f/avx/pk_gen.c new file mode 100644 index 00000000..0d61f360 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/pk_gen.c @@ -0,0 +1,329 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
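+ // (buf holds the 32x64 window extracted above and is reduced in place; the
+ // trailing-zero count of each pivot row gives the pivot column, and every row
+ // update is applied through a branch-free mask of the form
+ //     mask = (buf[j] >> s) & 1;  mask = -mask;  buf[j] ^= buf[i] & mask;
+ // rather than a data-dependent branch)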
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE348864F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + uint64_t sk_int[ GFBITS ]; + + vec256 consts[ 16 ][ GFBITS ]; + vec256 eval[ 16 ][ GFBITS ]; + vec256 prod[ 16 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_inv(tmp, prod[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j 
< NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE348864F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864F_AVX_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/avx/pk_gen.h b/crypto_kem/mceliece348864f/avx/pk_gen.h new file mode 100644 index 00000000..e8a1f9db --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE348864F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/powers.inc b/crypto_kem/mceliece348864f/avx/powers.inc new file mode 100644 index 00000000..cb21ce5b --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/powers.inc @@ -0,0 +1,224 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 
0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 
0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+ PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 
0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ 
+ PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, diff --git a/crypto_kem/mceliece348864f/avx/scalars.inc b/crypto_kem/mceliece348864f/avx/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864f/avx/scalars_2x.inc b/crypto_kem/mceliece348864f/avx/scalars_2x.inc new file mode 100644 index 00000000..5d690ca2 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x30033cc300c0c03c, 
0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffff00000000, 
0x00000000ffff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/crypto_kem/mceliece348864f/avx/sk_gen.c b/crypto_kem/mceliece348864f/avx/sk_gen.c new file mode 100644 index 00000000..29a7f766 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // Gaussian elimination + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/avx/sk_gen.h b/crypto_kem/mceliece348864f/avx/sk_gen.h new file mode
100644 index 00000000..fd47ebb4 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE348864F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/syndrome_asm.S b/crypto_kem/mceliece348864f/avx/syndrome_asm.S new file mode 100644 index 00000000..06cc0f2e --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/syndrome_asm.S @@ -0,0 +1,530 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE348864F_AVX_syndrome_asm +_PQCLEAN_MCELIECE348864F_AVX_syndrome_asm: +PQCLEAN_MCELIECE348864F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 
288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint64 *)(input_1 + 320) +# asm 1: movq 320(s=int64#6 +# asm 2: movq 320(s=%r9 +movq 320(%rsi),%r9 + +# qhasm: e = *(uint64 *)(input_2 + 416) +# asm 1: movq 416(e=int64#7 +# asm 2: movq 416(e=%rax +movq 416(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 328(p=%rax +movq 328(%rsi),%rax + +# qhasm: e = *(uint64 *)(input_2 + 424) +# asm 1: movq 424(e=int64#8 +# asm 2: movq 424(e=%r10 +movq 424(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 336(p=%eax +movl 336(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#8d +# asm 2: movl 432(e=%r10d +movl 432(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor + + +void PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp(vec256 *in); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/transpose_64x256_sp_asm.S 
b/crypto_kem/mceliece348864f/avx/transpose_64x256_sp_asm.S new file mode 100644 index 00000000..17c4412b --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/transpose_64x256_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 x0 + +# qhasm: reg256 x1 + +# qhasm: reg256 x2 + +# qhasm: reg256 x3 + +# qhasm: reg256 x4 + +# qhasm: reg256 x5 + +# qhasm: reg256 x6 + +# qhasm: reg256 x7 + +# qhasm: reg256 t0 + +# qhasm: reg256 t1 + +# qhasm: reg256 v00 + +# qhasm: reg256 v01 + +# qhasm: reg256 v10 + +# qhasm: reg256 v11 + +# qhasm: reg256 mask0 + +# qhasm: reg256 mask1 + +# qhasm: reg256 mask2 + +# qhasm: reg256 mask3 + +# qhasm: reg256 mask4 + +# qhasm: reg256 mask5 + +# qhasm: enter transpose_64x256_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm +.global PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm +_PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm: +PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK5_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0,>mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 
= mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 
+# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand 
v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq 
$32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 
16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 
2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 
+# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 
+# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# 
asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# 
asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: 
vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand 
v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld 
$16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: 
vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand 
v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw 
$8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 
+# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 
+# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 
+vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: 
vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# 
qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd 
PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# 
asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 
+# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 
480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 
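#
# Note (descriptive annotation, not produced by the qhasm generator): each
# eight-statement qhasm group in this file performs one mask/shift/or bit
# exchange ("delta swap") on a pair of 256-bit rows. With shift distance s
# and the mask pair (mask_lo, mask_hi) it computes
#     x_lo' = (x_lo & mask_lo) | ((x_hi & mask_lo) << s)
#     x_hi' = ((x_lo & mask_hi) >> s) | (x_hi & mask_hi)
# so the bit groups selected by the masks swap places between the two rows.
# The passes visible here use s = 4 with mask0/mask1 on rows four apart
# (x0/x4, x1/x5, ...), s = 2 with mask2/mask3 on rows two apart, and s = 1
# with mask4/mask5 on adjacent rows; the "4x" prefix means the shift acts on
# each of the four 64-bit lanes of a ymm register (vpsllq/vpsrlq).
#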
+# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 
+vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: 
vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 
896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand 
%ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor 
x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand 
%ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 
2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 
1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor 
x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# 
asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor 
x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: 
vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# 
asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 
+vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= 
mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 64 ] x2 +# asm 1: movddup 64(r1=reg128#8 +# asm 2: movddup 64(r1=%xmm7 +movddup 64(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 128 ] x2 +# asm 1: movddup 128(r2=reg128#9 +# asm 2: movddup 128(r2=%xmm8 +movddup 128(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 192 ] x2 +# asm 1: movddup 192(r3=reg128#10 +# asm 2: movddup 192(r3=%xmm9 +movddup 192(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 256 ] x2 +# asm 1: movddup 256(r4=reg128#11 +# asm 2: movddup 256(r4=%xmm10 +movddup 256(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 320 ] x2 +# asm 1: movddup 320(r5=reg128#12 +# asm 2: movddup 320(r5=%xmm11 +movddup 320(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 384 ] x2 +# asm 1: movddup 384(r6=reg128#13 +# asm 2: movddup 384(r6=%xmm12 +movddup 384(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 448 ] x2 +# asm 1: movddup 448(r7=reg128#14 +# asm 2: movddup 448(r7=%xmm13 +movddup 448(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 
2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: 
vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 0 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 64 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 128 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 192 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 256 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 320 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 384 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 448 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 8(r0=%xmm6 +movddup 8(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: 
movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r2=reg128#9 +# asm 2: movddup 136(r2=%xmm8 +movddup 136(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r3=reg128#10 +# asm 2: movddup 200(r3=%xmm9 +movddup 200(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r4=reg128#11 +# asm 2: movddup 264(r4=%xmm10 +movddup 264(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r5=reg128#12 +# asm 2: movddup 328(r5=%xmm11 +movddup 328(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r6=reg128#13 +# asm 2: movddup 392(r6=%xmm12 +movddup 392(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r7=reg128#14 +# asm 2: movddup 456(r7=%xmm13 +movddup 456(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 
+# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand 
v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 8 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 72 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 136 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 200 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 264 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 328 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 392 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 456 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 16(r0=%xmm6 +movddup 16(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r1=reg128#8 +# asm 2: movddup 80(r1=%xmm7 +movddup 80(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r3=reg128#10 +# asm 2: movddup 208(r3=%xmm9 +movddup 208(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 
272(r4=reg128#11 +# asm 2: movddup 272(r4=%xmm10 +movddup 272(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r5=reg128#12 +# asm 2: movddup 336(r5=%xmm11 +movddup 336(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r6=reg128#13 +# asm 2: movddup 400(r6=%xmm12 +movddup 400(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r7=reg128#14 +# asm 2: movddup 464(r7=%xmm13 +movddup 464(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld 
$16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 16 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 80 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 144 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 208 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 272 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 336 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 400 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 464 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 24(r0=%xmm6 +movddup 24(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r1=reg128#8 +# asm 2: movddup 88(r1=%xmm7 +movddup 88(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r2=reg128#9 +# asm 2: movddup 152(r2=%xmm8 +movddup 152(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r4=reg128#11 +# asm 2: movddup 280(r4=%xmm10 +movddup 280(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r5=reg128#12 +# asm 2: movddup 344(r5=%xmm11 +movddup 344(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r6=reg128#13 +# asm 2: movddup 408(r6=%xmm12 +movddup 408(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 
472(r7=reg128#14 +# asm 2: movddup 472(r7=%xmm13 +movddup 472(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: 
vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# 
asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 24 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 88 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 152 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 216 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 280 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 344 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 408 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 472 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 32(r0=%xmm6 +movddup 32(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r1=reg128#8 +# asm 2: movddup 96(r1=%xmm7 +movddup 96(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r2=reg128#9 +# asm 2: movddup 160(r2=%xmm8 +movddup 160(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r3=reg128#10 +# asm 2: movddup 224(r3=%xmm9 +movddup 224(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r5=reg128#12 +# asm 2: movddup 352(r5=%xmm11 +movddup 352(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r6=reg128#13 +# asm 2: movddup 416(r6=%xmm12 +movddup 416(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r7=reg128#14 +# asm 2: movddup 480(r7=%xmm13 +movddup 480(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq 
$32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand 
%xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 
+# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 32 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 96 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 160 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 224 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 288 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 352 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 416 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 480 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 40(r0=%xmm6 +movddup 40(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r1=reg128#8 +# asm 2: movddup 104(r1=%xmm7 +movddup 104(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r2=reg128#9 +# asm 2: movddup 168(r2=%xmm8 +movddup 168(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r3=reg128#10 +# asm 2: movddup 232(r3=%xmm9 +movddup 232(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r4=reg128#11 +# asm 2: movddup 296(r4=%xmm10 +movddup 296(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r6=reg128#13 +# asm 2: movddup 424(r6=%xmm12 +movddup 424(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r7=reg128#14 +# asm 2: movddup 488(r7=%xmm13 +movddup 488(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand 
v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 
2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw 
$8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 40 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 104 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 168 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 232 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 296 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 360 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 424 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 488 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 48(r0=%xmm6 +movddup 48(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r1=reg128#8 +# asm 2: movddup 112(r1=%xmm7 +movddup 112(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r2=reg128#9 +# asm 2: movddup 176(r2=%xmm8 +movddup 176(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r3=reg128#10 +# asm 2: movddup 240(r3=%xmm9 +movddup 240(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r4=reg128#11 +# asm 2: movddup 304(r4=%xmm10 +movddup 304(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r5=reg128#12 +# asm 2: movddup 368(r5=%xmm11 +movddup 368(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = 
v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | 
v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq 
$0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 48 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 112 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 176 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 240 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 304 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 368 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 432 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 496 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 56(r0=%xmm6 +movddup 56(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r1=reg128#8 +# asm 2: movddup 120(r1=%xmm7 +movddup 120(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r2=reg128#9 +# asm 2: movddup 184(r2=%xmm8 +movddup 184(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r3=reg128#10 +# asm 2: movddup 248(r3=%xmm9 +movddup 248(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r4=reg128#11 +# asm 2: movddup 312(r4=%xmm10 +movddup 312(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r5=reg128#12 +# asm 2: movddup 376(r5=%xmm11 +movddup 376(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r6=reg128#13 +# asm 2: movddup 440(r6=%xmm12 +movddup 440(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 
+vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld 
$16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: 
pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: 
vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 
+# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 
2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 +movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# 
asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: 
movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor 
%xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# 
asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 
+# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor 
%xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 
+vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: 
movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 
1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 
1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor 
%xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand 
v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 
+ +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu +#include + +void PQCLEAN_MCELIECE348864F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece348864f/avx/update_asm.S b/crypto_kem/mceliece348864f/avx/update_asm.S new file mode 100644 index 00000000..6ab3338c --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/update_asm.S @@ -0,0 +1,354 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_update_asm +.global PQCLEAN_MCELIECE348864F_AVX_update_asm +_PQCLEAN_MCELIECE348864F_AVX_update_asm: +PQCLEAN_MCELIECE348864F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s1 = input_1 +# asm 1: mov s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_AVX_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + 
+    return a & GFMASK;
+}
+
+uint32_t PQCLEAN_MCELIECE348864F_AVX_load4(const unsigned char *src) {
+    uint32_t a;
+
+    a = src[3];
+    a <<= 8;
+    a |= src[2];
+    a <<= 8;
+    a |= src[1];
+    a <<= 8;
+    a |= src[0];
+
+    return a;
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_irr_load(uint64_t *out, const unsigned char *in) {
+    int i, j;
+    uint16_t irr[ SYS_T + 1 ];
+
+    for (i = 0; i < SYS_T; i++) {
+        irr[i] = PQCLEAN_MCELIECE348864F_AVX_load2(in + i * 2);
+        irr[i] &= GFMASK;
+    }
+
+    irr[ SYS_T ] = 1;
+
+    for (i = 0; i < GFBITS; i++) {
+        out[i] = 0;
+    }
+
+    for (i = SYS_T; i >= 0; i--) {
+        for (j = 0; j < GFBITS; j++) {
+            out[j] <<= 1;
+            out[j] |= (irr[i] >> j) & 1;
+        }
+    }
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_store8(unsigned char *out, uint64_t in) {
+    out[0] = (in >> 0x00) & 0xFF;
+    out[1] = (in >> 0x08) & 0xFF;
+    out[2] = (in >> 0x10) & 0xFF;
+    out[3] = (in >> 0x18) & 0xFF;
+    out[4] = (in >> 0x20) & 0xFF;
+    out[5] = (in >> 0x28) & 0xFF;
+    out[6] = (in >> 0x30) & 0xFF;
+    out[7] = (in >> 0x38) & 0xFF;
+}
+
+uint64_t PQCLEAN_MCELIECE348864F_AVX_load8(const unsigned char *in) {
+    int i;
+    uint64_t ret = in[7];
+
+    for (i = 6; i >= 0; i--) {
+        ret <<= 8;
+        ret |= in[i];
+    }
+
+    return ret;
+}
+
+gf PQCLEAN_MCELIECE348864F_AVX_bitrev(gf a) {
+    a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8);
+    a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4);
+    a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2);
+    a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1);
+
+    return a >> 4;
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_load16(const unsigned char *in) {
+    return PQCLEAN_MCELIECE348864F_AVX_vec128_set2x( PQCLEAN_MCELIECE348864F_AVX_load8(in), PQCLEAN_MCELIECE348864F_AVX_load8(in + 8) );
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_store16(unsigned char *out, vec128 in) {
+    PQCLEAN_MCELIECE348864F_AVX_store8(out + 0, PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in, 0));
+    PQCLEAN_MCELIECE348864F_AVX_store8(out + 8, PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in, 1));
+}
diff --git a/crypto_kem/mceliece348864f/avx/util.h b/crypto_kem/mceliece348864f/avx/util.h
new file mode 100644
index 00000000..e9a6258e
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/util.h
@@ -0,0 +1,33 @@
+#ifndef PQCLEAN_MCELIECE348864F_AVX_UTIL_H
+#define PQCLEAN_MCELIECE348864F_AVX_UTIL_H
+/*
+  This file is for loading/storing data in a little-endian fashion
+*/
+
+
+#include "gf.h"
+#include "vec128.h"
+
+#include <stdint.h>
+
+void PQCLEAN_MCELIECE348864F_AVX_store_i(unsigned char *out, uint64_t in, int i);
+void PQCLEAN_MCELIECE348864F_AVX_store2(unsigned char *dest, gf a);
+
+uint16_t PQCLEAN_MCELIECE348864F_AVX_load2(const unsigned char *src);
+
+uint32_t PQCLEAN_MCELIECE348864F_AVX_load4(const unsigned char *src);
+
+void PQCLEAN_MCELIECE348864F_AVX_irr_load(uint64_t *out, const unsigned char *in);
+
+void PQCLEAN_MCELIECE348864F_AVX_store8(unsigned char *out, uint64_t in);
+
+uint64_t PQCLEAN_MCELIECE348864F_AVX_load8(const unsigned char *in);
+
+gf PQCLEAN_MCELIECE348864F_AVX_bitrev(gf a);
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_load16(const unsigned char *in);
+
+void PQCLEAN_MCELIECE348864F_AVX_store16(unsigned char *out, vec128 in);
+
+#endif
+
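Reader's note, not part of the patch: the util.c helpers above serialize values in little-endian byte order. Below is a minimal round-trip check of that behavior; the test file itself is hypothetical and assumes it is compiled with an include path that resolves "util.h" and its dependencies and linked against the mceliece348864f AVX objects from this diff.

/* Hypothetical standalone check, not part of the submitted sources. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#include "util.h"   /* the header added above */

int main(void) {
    unsigned char buf[8];
    uint64_t x = 0x0123456789ABCDEFULL;

    /* store8 writes the low byte first ... */
    PQCLEAN_MCELIECE348864F_AVX_store8(buf, x);
    assert(buf[0] == 0xEF && buf[7] == 0x01);

    /* ... and load8 reassembles the same value from those bytes. */
    assert(PQCLEAN_MCELIECE348864F_AVX_load8(buf) == x);

    printf("store8/load8 round-trip ok\n");
    return 0;
}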
diff --git a/crypto_kem/mceliece348864f/avx/vec.c b/crypto_kem/mceliece348864f/avx/vec.c
new file mode 100644
index 00000000..50e05ee0
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/vec.c
@@ -0,0 +1,25 @@
+#include "vec.h"
+
+#include "params.h"
+
+extern void PQCLEAN_MCELIECE348864F_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *);
+extern void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *);
+
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) {
+    PQCLEAN_MCELIECE348864F_AVX_vec_mul_asm(h, f, g);
+}
+
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) {
+    PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm(h, f, g);
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) {
+    int b;
+
+    for (b = 0; b < GFBITS; b++) {
+        h[b] = f[b] ^ g[b];
+    }
+}
+
diff --git a/crypto_kem/mceliece348864f/avx/vec.h b/crypto_kem/mceliece348864f/avx/vec.h
new file mode 100644
index 00000000..468f3e44
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/vec.h
@@ -0,0 +1,13 @@
+#ifndef PQCLEAN_MCELIECE348864F_AVX_VEC_H
+#define PQCLEAN_MCELIECE348864F_AVX_VEC_H
+
+#include <stdint.h>
+
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g);
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g);
+
+void PQCLEAN_MCELIECE348864F_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g);
+
+#endif
diff --git a/crypto_kem/mceliece348864f/avx/vec128.c b/crypto_kem/mceliece348864f/avx/vec128.c
new file mode 100644
index 00000000..e92772f3
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/vec128.c
@@ -0,0 +1,83 @@
+#include "vec128.h"
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(uint16_t a) {
+    return _mm_set1_epi16(a);
+}
+
+int PQCLEAN_MCELIECE348864F_AVX_vec128_testz(vec128 a) {
+    return _mm_testz_si128(a, a);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(void) {
+    return _mm_setzero_si128();
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_and(vec128 a, vec128 b) {
+    return _mm_and_si128(a, b);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_xor(vec128 a, vec128 b) {
+    return _mm_xor_si128(a, b);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or(vec128 a, vec128 b) {
+    return _mm_or_si128(a, b);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(vec128 a, int s) {
+    return _mm_slli_epi64(a, s);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(vec128 a, int s) {
+    return _mm_srli_epi64(a, s);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) {
+    return _mm_set_epi64x(a1, a0);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_low(vec128 a, vec128 b) {
+    return _mm_unpacklo_epi64(a, b);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_high(vec128 a, vec128 b) {
+    return _mm_unpackhi_epi64(a, b);
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setbits(uint64_t a) {
+    return _mm_set1_epi64x(-a);
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_vec128_copy(vec128 *dest, const vec128 *src) {
+    int i;
+
+    for (i = 0; i < GFBITS; i++) {
+        dest[i] = src[i];
+    }
+}
+
+void PQCLEAN_MCELIECE348864F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) {
+    int i;
+
+    for (i = 0; i < GFBITS; i++) {
+        c[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_xor(a[i], b[i]);
+    }
+}
+
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or_reduce(const vec128 *a) {
+    int i;
+    vec128 ret;
+
+    ret = a[0];
+    for (i = 1; i < GFBITS; i++) {
+        ret = PQCLEAN_MCELIECE348864F_AVX_vec128_or(ret, a[i]);
+    }
+
+    return ret;
+}
+
+/* bitsliced field multiplications */
+void PQCLEAN_MCELIECE348864F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) {
+    PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm(h, f, g, 16);
+}
+
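Reader's note, not part of the patch: vec_add and vec128_add above operate on a bitsliced layout, where limb b of a GFBITS-long array holds bit b of many independent GF(2^12) field elements, so field addition of a whole batch is one XOR per limb. A minimal standalone sketch of that layout follows; GFBITS is 12 for the 348864 parameter set, and the names slice_insert and bitsliced_add are illustrative, not from the sources.

/* Hypothetical sketch of the bitsliced representation, not part of the submitted sources. */
#include <stdint.h>

#define GFBITS 12  /* field is GF(2^12) for mceliece348864(f) */

/* Put field element a (12 bits) into column `col` (0..63): limb b holds bit b
 * of 64 independent field elements. */
static void slice_insert(uint64_t v[GFBITS], uint16_t a, int col) {
    for (int b = 0; b < GFBITS; b++) {
        v[b] |= (uint64_t)((a >> b) & 1) << col;
    }
}

/* Adds 64 element pairs at once: one XOR per limb, the same loop as
 * PQCLEAN_MCELIECE348864F_AVX_vec_add above. */
static void bitsliced_add(uint64_t h[GFBITS], const uint64_t f[GFBITS], const uint64_t g[GFBITS]) {
    for (int b = 0; b < GFBITS; b++) {
        h[b] = f[b] ^ g[b];
    }
}

int main(void) {
    uint64_t f[GFBITS] = {0}, g[GFBITS] = {0}, h[GFBITS] = {0};
    slice_insert(f, 0x123, 0);  /* element 0x123 in column 0 */
    slice_insert(g, 0x456, 0);  /* element 0x456 in column 0 */
    bitsliced_add(h, f, g);     /* column 0 of h now holds 0x123 ^ 0x456 = 0x575 */
    return 0;
}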
diff --git a/crypto_kem/mceliece348864f/avx/vec128.h b/crypto_kem/mceliece348864f/avx/vec128.h
new file mode 100644
index 00000000..15c81fcd
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/vec128.h
@@ -0,0 +1,41 @@
+#ifndef PQCLEAN_MCELIECE348864F_AVX_VEC128_H
+#define PQCLEAN_MCELIECE348864F_AVX_VEC128_H
+/*
+  This file is for functions related to 128-bit vectors
+  including functions for bitsliced field operations
+*/
+
+
+#include "params.h"
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+typedef __m128i vec128;
+
+// this needs to be a macro, because
+// _mm_extract_epi64 requires a literal int argument.
+#define PQCLEAN_MCELIECE348864F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i)))
+
+int PQCLEAN_MCELIECE348864F_AVX_vec128_testz(vec128 a);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(uint16_t a);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(void);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_and(vec128 a, vec128 b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_xor(vec128 a, vec128 b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or(vec128 a, vec128 b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(vec128 a, int s);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(vec128 a, int s);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(uint64_t a0, uint64_t a1);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_low(vec128 a, vec128 b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_high(vec128 a, vec128 b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setbits(uint64_t a);
+void PQCLEAN_MCELIECE348864F_AVX_vec128_copy(vec128 *dest, const vec128 *src);
+void PQCLEAN_MCELIECE348864F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b);
+vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or_reduce(const vec128 *a);
+
+extern void PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int);
+
+/* bitsliced field multiplications */
+void PQCLEAN_MCELIECE348864F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g);
+
+#endif
diff --git a/crypto_kem/mceliece348864f/avx/vec128_mul_asm.S b/crypto_kem/mceliece348864f/avx/vec128_mul_asm.S
new file mode 100644
index 00000000..b8e33295
--- /dev/null
+++ b/crypto_kem/mceliece348864f/avx/vec128_mul_asm.S
@@ -0,0 +1,1369 @@
+
+# qhasm: int64 input_0
+
+# qhasm: int64 input_1
+
+# qhasm: int64 input_2
+
+# qhasm: int64 input_3
+
+# qhasm: int64 input_4
+
+# qhasm: int64 input_5
+
+# qhasm: stack64 input_6
+
+# qhasm: stack64 input_7
+
+# qhasm: int64 caller_r11
+
+# qhasm: int64 caller_r12
+
+# qhasm: int64 caller_r13
+
+# qhasm: int64 caller_r14
+
+# qhasm: int64 caller_r15
+
+# qhasm: int64 caller_rbx
+
+# qhasm: int64 caller_rbp
+
+# qhasm: reg256 b0
+
+# qhasm: reg256 b1
+
+# qhasm: reg256 b2
+
+# qhasm: reg256 b3
+
+# qhasm: reg256 b4
+
+# qhasm: reg256 b5
+
+# qhasm: reg256 b6
+
+# qhasm: reg256 b7
+
+# qhasm: reg256 b8
+
+# qhasm: reg256 b9
+
+# qhasm: reg256 b10
+
+# qhasm: reg256 b11
+
+# qhasm: reg256 a0
+
+# qhasm: reg256 a1
+
+# qhasm: reg256 a2
+
+# qhasm: reg256 a3
+
+# qhasm: reg256 a4
+
+# qhasm: reg256 a5
+
+# qhasm: reg256 r0
+
+# qhasm: reg256 r1
+
+# qhasm: reg256 r2
+
+# qhasm: reg256 r3
+
+# qhasm: reg256 r4
+
+# qhasm: reg256 r5
+
+# qhasm: reg256 r6
+
+# qhasm: reg256 r7
+
+# qhasm: reg256 r8
+
+# qhasm: reg256 r9
+
+# qhasm: reg256 r10
+
+# qhasm: reg256 r11
+
+# qhasm: reg256 r12
+
+# qhasm: reg256 r13
+
+# qhasm: reg256 r14
+
+# qhasm: reg256 r15
+
+# qhasm: reg256 r16
+
+# qhasm: reg256 r17
+
+# qhasm: reg256 r18
+
+# qhasm: reg256 r19
+
+# qhasm: reg256 r20
+
+# qhasm: reg256 r21
+
+# qhasm: reg256 r22
+
+# qhasm: reg256 r
+
+# qhasm: reg128 h0
+
+# qhasm: reg128 h1
+
+# qhasm: reg128 h2
+
+# qhasm: reg128 h3
+
+# qhasm: reg128 h4
+
+# qhasm: 
reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: b11 = mem128[ input_2 + 176 ] x2 +# asm 1: vbroadcasti128 176(b11=reg256#1 +# asm 2: vbroadcasti128 176(b11=%ymm0 +vbroadcasti128 176(%rdx), %ymm0 + +# qhasm: a5[0] = mem128[ input_1 + 80 ] +# asm 1: vinsertf128 $0x0,80(r16=reg256#3 +# asm 2: vpand r16=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 512 ] = r16 +# asm 1: vmovupd r15=reg256#4 +# asm 2: vpand r15=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r14=reg256#6 +# asm 2: vpand r14=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r13=reg256#8 +# asm 2: vpand r13=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r12=reg256#10 +# asm 2: vpand r12=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r11=reg256#1 +# asm 2: vpand r11=%ymm0 +vpand %ymm0,%ymm10,%ymm0 + +# qhasm: b10 = mem128[ input_2 + 160 ] x2 +# asm 1: vbroadcasti128 160(b10=reg256#12 +# asm 2: vbroadcasti128 160(b10=%ymm11 +vbroadcasti128 160(%rdx), %ymm11 + +# qhasm: r = b10 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#4 +# asm 2: vpand r10=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b9 = mem128[ input_2 + 144 ] x2 +# asm 1: vbroadcasti128 144(b9=reg256#12 +# asm 2: vbroadcasti128 144(b9=%ymm11 +vbroadcasti128 144(%rdx), %ymm11 + +# qhasm: r = b9 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#6 +# asm 2: vpand r9=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b8 = mem128[ input_2 + 128 ] x2 +# asm 1: vbroadcasti128 128(b8=reg256#12 +# asm 2: vbroadcasti128 128(b8=%ymm11 +vbroadcasti128 128(%rdx), %ymm11 + +# qhasm: r = b8 & a5 +# asm 1: vpand r=reg256#13 
+# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#8 +# asm 2: vpand r8=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b7 = mem128[ input_2 + 112 ] x2 +# asm 1: vbroadcasti128 112(b7=reg256#12 +# asm 2: vbroadcasti128 112(b7=%ymm11 +vbroadcasti128 112(%rdx), %ymm11 + +# qhasm: r = b7 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#10 +# asm 2: vpand r7=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b6 = mem128[ input_2 + 96 ] x2 +# asm 1: vbroadcasti128 96(b6=reg256#12 +# asm 2: vbroadcasti128 96(b6=%ymm11 +vbroadcasti128 96(%rdx), %ymm11 + +# qhasm: r = b6 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 80 ] x2 +# asm 1: vbroadcasti128 80(b5=reg256#12 +# asm 2: vbroadcasti128 80(b5=%ymm11 +vbroadcasti128 80(%rdx), %ymm11 + +# qhasm: r = b5 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 64 ] x2 +# asm 1: vbroadcasti128 64(b4=reg256#12 +# asm 2: vbroadcasti128 64(b4=%ymm11 +vbroadcasti128 64(%rdx), %ymm11 + +# qhasm: r = b4 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# 
qhasm: b3 = mem128[ input_2 + 48 ] x2 +# asm 1: vbroadcasti128 48(b3=reg256#12 +# asm 2: vbroadcasti128 48(b3=%ymm11 +vbroadcasti128 48(%rdx), %ymm11 + +# qhasm: r = b3 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 32 ] x2 +# asm 1: vbroadcasti128 32(b2=reg256#12 +# asm 2: vbroadcasti128 32(b2=%ymm11 +vbroadcasti128 32(%rdx), %ymm11 + +# qhasm: r = b2 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 16 ] x2 +# asm 1: vbroadcasti128 16(b1=reg256#12 +# asm 2: vbroadcasti128 16(b1=%ymm11 +vbroadcasti128 16(%rdx), %ymm11 + +# qhasm: r = b1 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#1 +# asm 2: vpand r1=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#12 +# asm 2: vbroadcasti128 0(b0=%ymm11 +vbroadcasti128 0(%rdx), %ymm11 + +# qhasm: r = b0 & a5 +# asm 1: vpand r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm1,%ymm1 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm2,%ymm1 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm4,%ymm1 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm6,%ymm1 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm8,%ymm1 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#2 +# asm 2: vpand r0=%ymm1 +vpand %ymm11,%ymm10,%ymm1 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=reg128#1 +# asm 2: movdqu 528(h22=%xmm0 +movdqu 528(%rcx),%xmm0 + +# qhasm: h13 = h22 +# asm 1: movdqa h13=reg128#2 +# asm 2: movdqa h13=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h10 = h22 +# asm 1: movdqa h10=reg128#1 +# asm 2: movdqa h10=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h21 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h21=reg128#3 +# asm 2: movdqu 496(h21=%xmm2 +movdqu 496(%rcx),%xmm2 + +# qhasm: h12 = h21 +# asm 1: movdqa h12=reg128#4 +# asm 2: movdqa h12=%xmm3 +movdqa %xmm2,%xmm3 + +# qhasm: h9 = h21 +# 
asm 1: movdqa h9=reg128#3 +# asm 2: movdqa h9=%xmm2 +movdqa %xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h20=reg128#5 +# asm 2: movdqu 464(h20=%xmm4 +movdqu 464(%rcx),%xmm4 + +# qhasm: h11 = h20 +# asm 1: movdqa h11=reg128#6 +# asm 2: movdqa h11=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h8 = h20 +# asm 1: movdqa h8=reg128#5 +# asm 2: movdqa h8=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: h19 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h19=reg128#7 +# asm 2: movdqu 432(h19=%xmm6 +movdqu 432(%rcx),%xmm6 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#1 +# asm 2: vpxor h10=%xmm0 +vpxor %xmm6,%xmm0,%xmm0 + +# qhasm: h7 = h19 +# asm 1: movdqa h7=reg128#7 +# asm 2: movdqa h7=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: h18 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h18=reg128#8 +# asm 2: movdqu 400(h18=%xmm7 +movdqu 400(%rcx),%xmm7 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#3 +# asm 2: vpxor h9=%xmm2 +vpxor %xmm7,%xmm2,%xmm2 + +# qhasm: h6 = h18 +# asm 1: movdqa h6=reg128#8 +# asm 2: movdqa h6=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: h17 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h17=reg128#9 +# asm 2: movdqu 368(h17=%xmm8 +movdqu 368(%rcx),%xmm8 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#5 +# asm 2: vpxor h8=%xmm4 +vpxor %xmm8,%xmm4,%xmm4 + +# qhasm: h5 = h17 +# asm 1: movdqa h5=reg128#9 +# asm 2: movdqa h5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: h16 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h16=reg128#10 +# asm 2: movdqu 336(h16=%xmm9 +movdqu 336(%rcx),%xmm9 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#10 +# asm 2: vpxor 512(h16=%xmm9 +vpxor 512(%rcx),%xmm9,%xmm9 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#7 +# asm 2: vpxor h7=%xmm6 +vpxor %xmm9,%xmm6,%xmm6 + +# qhasm: h4 = h16 +# asm 1: movdqa h4=reg128#10 +# asm 2: movdqa h4=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: h15 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h15=reg128#11 +# asm 2: movdqu 304(h15=%xmm10 +movdqu 304(%rcx),%xmm10 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#11 +# asm 2: vpxor 480(h15=%xmm10 +vpxor 480(%rcx),%xmm10,%xmm10 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#8 +# asm 2: vpxor h6=%xmm7 +vpxor %xmm10,%xmm7,%xmm7 + +# qhasm: h3 = h15 +# asm 1: movdqa h3=reg128#11 +# asm 2: movdqa h3=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: h14 = mem128[ ptr + 272 ] +# asm 1: movdqu 272(h14=reg128#12 +# asm 2: movdqu 272(h14=%xmm11 +movdqu 272(%rcx),%xmm11 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#12 +# asm 2: vpxor 448(h14=%xmm11 +vpxor 448(%rcx),%xmm11,%xmm11 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#9 +# asm 2: vpxor h5=%xmm8 +vpxor %xmm11,%xmm8,%xmm8 + +# qhasm: h2 = h14 +# asm 1: movdqa h2=reg128#12 +# asm 2: movdqa h2=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: h13 = h13 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h13=reg128#2 +# asm 2: vpxor 240(h13=%xmm1 +vpxor 240(%rcx),%xmm1,%xmm1 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#2 +# asm 2: vpxor 416(h13=%xmm1 +vpxor 416(%rcx),%xmm1,%xmm1 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#10 +# asm 2: vpxor h4=%xmm9 +vpxor %xmm1,%xmm9,%xmm9 + +# qhasm: h1 = h13 +# asm 1: movdqa h1=reg128#2 +# asm 2: movdqa h1=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: h12 = h12 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h12=reg128#4 +# asm 2: vpxor 208(h12=%xmm3 +vpxor 208(%rcx),%xmm3,%xmm3 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#4 +# asm 2: vpxor 384(h12=%xmm3 +vpxor 384(%rcx),%xmm3,%xmm3 + +# qhasm: h3 
= h3 ^ h12 +# asm 1: vpxor h3=reg128#11 +# asm 2: vpxor h3=%xmm10 +vpxor %xmm3,%xmm10,%xmm10 + +# qhasm: h0 = h12 +# asm 1: movdqa h0=reg128#4 +# asm 2: movdqa h0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: h11 = h11 ^ mem128[ ptr + 352 ] +# asm 1: vpxor 352(h11=reg128#6 +# asm 2: vpxor 352(h11=%xmm5 +vpxor 352(%rcx),%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h11=reg128#6 +# asm 2: vpxor 176(h11=%xmm5 +vpxor 176(%rcx),%xmm5,%xmm5 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#1 +# asm 2: vpxor 320(h10=%xmm0 +vpxor 320(%rcx),%xmm0,%xmm0 + +# qhasm: h10 = h10 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h10=reg128#1 +# asm 2: vpxor 144(h10=%xmm0 +vpxor 144(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#1 +# asm 2: vpxor 288(h9=%xmm0 +vpxor 288(%rcx),%xmm2,%xmm0 + +# qhasm: h9 = h9 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h9=reg128#1 +# asm 2: vpxor 112(h9=%xmm0 +vpxor 112(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#1 +# asm 2: vpxor 256(h8=%xmm0 +vpxor 256(%rcx),%xmm4,%xmm0 + +# qhasm: h8 = h8 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h8=reg128#1 +# asm 2: vpxor 80(h8=%xmm0 +vpxor 80(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#1 +# asm 2: vpxor 224(h7=%xmm0 +vpxor 224(%rcx),%xmm6,%xmm0 + +# qhasm: h7 = h7 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h7=reg128#1 +# asm 2: vpxor 48(h7=%xmm0 +vpxor 48(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%rcx),%xmm7,%xmm0 + +# qhasm: h6 = h6 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h6=reg128#1 +# asm 2: vpxor 16(h6=%xmm0 +vpxor 16(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%rcx),%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%rcx),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%rcx),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%rcx),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%rcx),%xmm1,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%rcx),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE348864F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE348864F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE348864F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 
PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE348864F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE348864F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE348864F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE348864F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE348864F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece348864f/avx/vec256_mul_asm.S b/crypto_kem/mceliece348864f/avx/vec256_mul_asm.S new file mode 100644 index 00000000..e12810b2 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/vec256_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: enter vec256_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm +_PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm: +PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#2 +# asm 2: vmovupd 352(a11=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg256#3 +# asm 2: vpand r11=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r12 = a11 & mem256[input_2 + 32] +# asm 1: vpand 32(r12=reg256#4 +# asm 2: vpand 32(r12=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r13 = a11 & mem256[input_2 + 64] +# asm 1: vpand 64(r13=reg256#5 +# asm 2: vpand 64(r13=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r14 = a11 & mem256[input_2 + 96] +# asm 1: vpand 96(r14=reg256#6 +# asm 2: vpand 96(r14=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r15 = a11 & 
mem256[input_2 + 128] +# asm 1: vpand 128(r15=reg256#7 +# asm 2: vpand 128(r15=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r16 = a11 & mem256[input_2 + 160] +# asm 1: vpand 160(r16=reg256#8 +# asm 2: vpand 160(r16=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r17 = a11 & mem256[input_2 + 192] +# asm 1: vpand 192(r17=reg256#9 +# asm 2: vpand 192(r17=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r18 = a11 & mem256[input_2 + 224] +# asm 1: vpand 224(r18=reg256#10 +# asm 2: vpand 224(r18=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r19 = a11 & mem256[input_2 + 256] +# asm 1: vpand 256(r19=reg256#11 +# asm 2: vpand 256(r19=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r20 = a11 & mem256[input_2 + 288] +# asm 1: vpand 288(r20=reg256#12 +# asm 2: vpand 288(r20=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r21 = a11 & mem256[input_2 + 320] +# asm 1: vpand 320(r21=reg256#13 +# asm 2: vpand 320(r21=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r22 = a11 & mem256[input_2 + 352] +# asm 1: vpand 352(r22=reg256#2 +# asm 2: vpand 352(r22=%ymm1 +vpand 352(%rdx),%ymm1,%ymm1 + +# qhasm: r13 ^= r22 +# asm 1: vpxor r10=reg256#2 +# asm 2: vmovapd r10=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#14 +# asm 2: vmovupd 320(a10=%ymm13 +vmovupd 320(%rsi),%ymm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r21 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#14 +# asm 2: vmovupd 288(a9=%ymm13 +vmovupd 288(%rsi),%ymm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# 
asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r20 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#14 +# asm 2: vmovupd 256(a8=%ymm13 +vmovupd 256(%rsi),%ymm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r19 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#14 +# asm 2: vmovupd 224(a7=%ymm13 +vmovupd 224(%rsi),%ymm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + 
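+# Pattern of this block: for each 256-bit bit-plane a_i of input_1, the twelve
+# planes of input_2 are ANDed in (vpand) and XOR-accumulated (vpxor) into the
+# partial planes r0..r22; a plane of degree 12 or higher is folded back as soon
+# as it is complete (r22 into r13 and r10 above), which corresponds to
+# reduction modulo x^12 + x^3 + 1.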
+# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r18 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#14 +# asm 2: vmovupd 192(a6=%ymm13 +vmovupd 192(%rsi),%ymm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r17 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#14 +# asm 2: vmovupd 160(a5=%ymm13 +vmovupd 160(%rsi),%ymm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r16 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#14 +# asm 2: vmovupd 128(a4=%ymm13 +vmovupd 128(%rsi),%ymm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 
32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r15 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#14 +# asm 2: vmovupd 96(a3=%ymm13 +vmovupd 96(%rsi),%ymm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r14 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#14 +# asm 2: vmovupd 64(a2=%ymm13 +vmovupd 64(%rsi),%ymm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 
+# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r13 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#14 +# asm 2: vmovupd 32(a1=%ymm13 +vmovupd 32(%rsi),%ymm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r12 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#14 +# asm 2: vmovupd 0(a0=%ymm13 +vmovupd 0(%rsi),%ymm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm13,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm13,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm13,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm13,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm13,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm13,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm13,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm13,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm13,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm13,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 
352(r=%ymm0 +vpand 352(%rdx),%ymm13,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: t0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(t0=reg256#4 +# asm 2: vmovupd 0(t0=%ymm3 +vmovupd 0(%rdx),%ymm3 + +# qhasm: t1 = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(t1=reg256#5 +# asm 2: vmovupd 32(t1=%ymm4 +vmovupd 32(%rdx),%ymm4 + +# qhasm: t2 = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(t2=reg256#6 +# asm 2: vmovupd 64(t2=%ymm5 +vmovupd 64(%rdx),%ymm5 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#7 +# asm 2: vpermq $0xfa,a5=%ymm6 +vpermq $0xfa,%ymm2,%ymm6 + +# qhasm: b5[0,1,2,3] = t2[2,3,2,3] +# asm 1: vpermq $0xee,b5=reg256#8 +# asm 2: vpermq $0xee,b5=%ymm7 +vpermq $0xee,%ymm5,%ymm7 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#9 +# asm 2: vpand r10=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd b4=reg256#6 +# asm 2: vpermq $0x44,b4=%ymm5 +vpermq $0x44,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#9 +# asm 2: vpand r9=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: b3[0,1,2,3] = t1[2,3,2,3] +# asm 1: vpermq $0xee,b3=reg256#10 +# asm 2: vpermq $0xee,b3=%ymm9 +vpermq $0xee,%ymm4,%ymm9 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#11 +# asm 2: vpand r8=%ymm10 +vpand %ymm6,%ymm9,%ymm10 + +# qhasm: b2[0,1,2,3] = t1[0,1,0,1] +# asm 1: vpermq $0x44,b2=reg256#5 +# asm 2: vpermq $0x44,b2=%ymm4 +vpermq $0x44,%ymm4,%ymm4 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm6,%ymm4,%ymm11 + +# qhasm: b1[0,1,2,3] = t0[2,3,2,3] +# asm 1: vpermq $0xee,b1=reg256#13 +# asm 2: vpermq $0xee,b1=%ymm12 +vpermq $0xee,%ymm3,%ymm12 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#14 +# asm 2: vpand r6=%ymm13 +vpand %ymm6,%ymm12,%ymm13 + +# qhasm: b0[0,1,2,3] = t0[0,1,0,1] +# asm 1: vpermq $0x44,b0=reg256#4 +# asm 2: vpermq $0x44,b0=%ymm3 +vpermq $0x44,%ymm3,%ymm3 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#7 +# asm 2: vpand r5=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm7,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm5,%ymm8 + +# qhasm: r8 ^= r +# asm 1: vpxor 
r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm9,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm4,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm12,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm3,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#9 +# asm 2: vpermq $0xfa,a3=%ymm8 +vpermq $0xfa,%ymm1,%ymm8 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm8,%ymm7,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm12,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#9 +# asm 2: vpand r3=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm7,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm5,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm9,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm4,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm12,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#11 +# asm 2: vpermq $0xfa,a1=%ymm10 +vpermq $0xfa,%ymm0,%ymm10 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm7,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm5,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm9,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm4,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm12,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#11 +# asm 2: vpand r1=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm0,%ymm7,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm5,%ymm5 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm9,%ymm5 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm12,%ymm4 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + 
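+# Reduction step: the 23 partial-product planes r0..r22 accumulated in the
+# stack buffer at ptr are folded down to 12 result planes; plane 12+i is XORed
+# into planes i+3 and i (h22 feeds h13 and h10 just above, h21 feeds h12 and h9
+# next), i.e. reduction modulo x^12 + x^3 + 1. Ignoring the exact load/xor
+# schedule, the net effect is roughly:
+#     for (i = 22; i >= 12; i--) { r[i-9] ^= r[i]; r[i-12] ^= r[i]; }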
+# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864f/avx/vec_mul_sp_asm.S b/crypto_kem/mceliece348864f/avx/vec_mul_sp_asm.S new file mode 100644 index 00000000..fe493cdf --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/vec_mul_sp_asm.S @@ -0,0 +1,1115 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# 
qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 s0 + +# qhasm: reg256 s1 + +# qhasm: reg256 s2 + +# qhasm: reg256 s3 + +# qhasm: reg256 s4 + +# qhasm: reg256 s5 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: int64 h0 + +# qhasm: int64 h1 + +# qhasm: int64 h2 + +# qhasm: int64 h3 + +# qhasm: int64 h4 + +# qhasm: int64 h5 + +# qhasm: int64 h6 + +# qhasm: int64 h7 + +# qhasm: int64 h8 + +# qhasm: int64 h9 + +# qhasm: int64 h10 + +# qhasm: int64 h11 + +# qhasm: int64 h12 + +# qhasm: int64 h13 + +# qhasm: int64 h14 + +# qhasm: int64 h15 + +# qhasm: int64 h16 + +# qhasm: int64 h17 + +# qhasm: int64 h18 + +# qhasm: int64 h19 + +# qhasm: int64 h20 + +# qhasm: int64 h21 + +# qhasm: int64 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: stack64 r11_stack + +# qhasm: stack64 r12_stack + +# qhasm: stack64 r13_stack + +# qhasm: stack64 r14_stack + +# qhasm: stack64 r15_stack + +# qhasm: stack64 rbx_stack + +# qhasm: stack64 rbp_stack + +# qhasm: enter vec_mul_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm +_PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm: +PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $672,%r11 +sub %r11,%rsp + +# qhasm: r11_stack = caller_r11 +# asm 1: movq r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#4 +# asm 2: vpermq $0xfa,a5=%ymm3 +vpermq $0xfa,%ymm2,%ymm3 + +# qhasm: r = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(r=reg256#5 
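+# This routine follows the same bitsliced multiply-then-reduce shape as the
+# other vec_mul variants: the vpermq shuffles replicate 64-bit lanes of the
+# operands so that each 256-bit vpand forms several 64-bit partial products at
+# once, the products are XOR-accumulated into the stack buffer at ptr, and the
+# 64-bit folding pass at the end reduces the 23 partial planes back to 12.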
+# asm 2: vmovupd 160(r=%ymm4 +vmovupd 160(%rdx),%ymm4 + +# qhasm: b5[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b5=reg256#5 +# asm 2: vpermq $0xdd,b5=%ymm4 +vpermq $0xdd,%ymm4,%ymm4 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm3,%ymm4,%ymm5 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd r=reg256#6 +# asm 2: vmovupd 128(r=%ymm5 +vmovupd 128(%rdx),%ymm5 + +# qhasm: b4[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b4=reg256#6 +# asm 2: vpermq $0xdd,b4=%ymm5 +vpermq $0xdd,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#7 +# asm 2: vpand r9=%ymm6 +vpand %ymm3,%ymm5,%ymm6 + +# qhasm: r = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(r=reg256#8 +# asm 2: vmovupd 96(r=%ymm7 +vmovupd 96(%rdx),%ymm7 + +# qhasm: b3[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b3=reg256#8 +# asm 2: vpermq $0xdd,b3=%ymm7 +vpermq $0xdd,%ymm7,%ymm7 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#9 +# asm 2: vpand r8=%ymm8 +vpand %ymm3,%ymm7,%ymm8 + +# qhasm: r = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(r=reg256#10 +# asm 2: vmovupd 64(r=%ymm9 +vmovupd 64(%rdx),%ymm9 + +# qhasm: b2[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b2=reg256#10 +# asm 2: vpermq $0xdd,b2=%ymm9 +vpermq $0xdd,%ymm9,%ymm9 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#11 +# asm 2: vpand r7=%ymm10 +vpand %ymm3,%ymm9,%ymm10 + +# qhasm: r = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(r=reg256#12 +# asm 2: vmovupd 32(r=%ymm11 +vmovupd 32(%rdx),%ymm11 + +# qhasm: b1[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b1=reg256#12 +# asm 2: vpermq $0xdd,b1=%ymm11 +vpermq $0xdd,%ymm11,%ymm11 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#13 +# asm 2: vpand r6=%ymm12 +vpand %ymm3,%ymm11,%ymm12 + +# qhasm: r = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(r=reg256#14 +# asm 2: vmovupd 0(r=%ymm13 +vmovupd 0(%rdx),%ymm13 + +# qhasm: b0[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b0=reg256#14 +# asm 2: vpermq $0xdd,b0=%ymm13 +vpermq $0xdd,%ymm13,%ymm13 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm3,%ymm13,%ymm3 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm4,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm5,%ymm6 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm7,%ymm6 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm9,%ymm6 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm11,%ymm6 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm13,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#7 +# asm 2: vpermq $0xfa,a3=%ymm6 +vpermq $0xfa,%ymm1,%ymm6 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm6,%ymm4,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm9,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm11,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vpand r3=%ymm6 +vpand %ymm6,%ymm13,%ymm6 + +# qhasm: a2[0,1,2,3] 
= s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm4,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm5,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm7,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm9,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm11,%ymm8 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm13,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#9 +# asm 2: vpermq $0xfa,a1=%ymm8 +vpermq $0xfa,%ymm0,%ymm8 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm7,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm11,%ymm10 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#9 +# asm 2: vpand r1=%ymm8 +vpand %ymm8,%ymm13,%ymm8 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm5,%ymm3 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm7,%ymm3 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm9,%ymm3 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm11,%ymm3 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm13,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: 
movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864f/avx/vec_reduce_asm.S b/crypto_kem/mceliece348864f/avx/vec_reduce_asm.S new file mode 100644 index 00000000..58a4c720 --- /dev/null +++ b/crypto_kem/mceliece348864f/avx/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm +_PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm: +PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt 
c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864F_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864f/clean/api.h b/crypto_kem/mceliece348864f/clean/api.h new file mode 100644 index 00000000..ecf9c697 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/api.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_API_H +#define PQCLEAN_MCELIECE348864F_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_ALGNAME "Classic McEliece 348864f" +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_BYTES 32 + + + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/benes.c b/crypto_kem/mceliece348864f/clean/benes.c new file mode 100644 index 00000000..931378c2 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/benes.c @@ -0,0 +1,139 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* one layer of the benes network */ +static void layer(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 
64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864F_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i; + + const unsigned char *cond_ptr; + int inc, low; + + uint64_t bs[64]; + uint64_t cond[64]; + + // + + for (i = 0; i < 64; i++) { + bs[i] = PQCLEAN_MCELIECE348864F_CLEAN_load8(r + i * 8); + } + + if (rev == 0) { + inc = 256; + cond_ptr = bits; + } else { + inc = -256; + cond_ptr = bits + (2 * GFBITS - 2) * 256; + } + + // + + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(bs, bs); + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(bs, bs); + + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864F_CLEAN_store8(r + i * 8, bs[i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE348864F_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE348864F_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE348864F_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece348864f/clean/benes.h b/crypto_kem/mceliece348864f/clean/benes.h new file mode 100644 index 00000000..1fdadad8 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_BENES_H +#define PQCLEAN_MCELIECE348864F_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" + +void PQCLEAN_MCELIECE348864F_CLEAN_apply_benes(unsigned char * /*r*/, const unsigned char * /*bits*/, int /*rev*/); +void PQCLEAN_MCELIECE348864F_CLEAN_support_gen(gf * /*s*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/bm.c b/crypto_kem/mceliece348864f/clean/bm.c new file mode 100644 index 00000000..51c64b7b --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file 
is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE348864F_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE348864F_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece348864f/clean/bm.h b/crypto_kem/mceliece348864f/clean/bm.h new file mode 100644 index 00000000..38a6dd45 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_BM_H +#define PQCLEAN_MCELIECE348864F_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864F_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/controlbits.c b/crypto_kem/mceliece348864f/clean/controlbits.c new file mode 100644 index 00000000..1cd634a0 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864f/clean/controlbits.h b/crypto_kem/mceliece348864f/clean/controlbits.h new file mode 100644 index 00000000..58f8ce61 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864F_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + + diff --git a/crypto_kem/mceliece348864f/clean/crypto_hash.h b/crypto_kem/mceliece348864f/clean/crypto_hash.h new file mode 100644 index 00000000..fb719cfc --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864F_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864f/clean/decrypt.c b/crypto_kem/mceliece348864f/clean/decrypt.c new file mode 100644 index 00000000..f5e2765f --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864F_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE348864F_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE348864F_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE348864F_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE348864F_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE348864F_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE348864F_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece348864f/clean/decrypt.h b/crypto_kem/mceliece348864f/clean/decrypt.h new file mode 100644 index 00000000..b3ac6462 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE348864F_CLEAN_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE348864F_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + + diff --git a/crypto_kem/mceliece348864f/clean/encrypt.c b/crypto_kem/mceliece348864f/clean/encrypt.c new file mode 100644 index 00000000..19c6318f --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + 
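+/* same_mask(x, y) below returns 0xFF when x == y and 0x00 otherwise, computed without data-dependent branches; gen_e() uses it to scatter the chosen error positions into the error vector e in constant time */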
+static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE348864F_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864f/clean/encrypt.h b/crypto_kem/mceliece348864f/clean/encrypt.h new file mode 100644 index 00000000..91e10932 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE348864F_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864F_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/gf.c b/crypto_kem/mceliece348864f/clean/gf.c new file mode 100644 index 00000000..36d27596 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/gf.c @@ -0,0 +1,139 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; 
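+    /* the pass above folded bits 14..22 of the up-to-23-bit carryless product back into the low bits via z^12 = z^3 + 1 (each such bit is XORed in 9 and 12 positions lower); the pass below clears bits 12 and 13, which that fold can set again, before the result is masked to GFBITS bits */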
+ + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece348864f/clean/gf.h b/crypto_kem/mceliece348864f/clean/gf.h new file mode 100644 index 00000000..6bdacca1 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_GF_H +#define PQCLEAN_MCELIECE348864F_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE348864F_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE348864F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/operations.c b/crypto_kem/mceliece348864f/clean/operations.c new file mode 100644 index 00000000..58c34a20 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" 
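+/* KEM layer: crypto_kem_enc() appends the confirmation hash of (2, e) to the ciphertext and derives the session key as the hash of (1, e, C); crypto_kem_dec() recomputes the confirmation and, if decryption or the confirmation check fails, swaps the hashed input from e to the secret string stored at the start of sk and sets the leading byte to 0 instead of 1 (implicit rejection), selecting between the two with a bitmask rather than a branch */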
+#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864F_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864F_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864F_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864F_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864F_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864F_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864F_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864F_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864F_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864F_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/clean/params.h b/crypto_kem/mceliece348864f/clean/params.h new file mode 100644 index 00000000..c736eb09 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE348864F_CLEAN_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git 
a/crypto_kem/mceliece348864f/clean/pk_gen.c b/crypto_kem/mceliece348864f/clean/pk_gen.c new file mode 100644 index 00000000..46ffc856 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/pk_gen.c @@ -0,0 +1,294 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint8_t mat[][ SYS_N / 8 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 8; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = PQCLEAN_MCELIECE348864F_CLEAN_load8( &mat[ row + i ][ block_idx ] ); + } + + // compute the column indices of pivots by Gaussian elimination. 
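+    // (t below is the OR of row i and all remaining rows of the extracted 32x64 block, so t == 0 means the block is singular and key generation is retried; the constant-time ctz() then picks the lowest set bit of t as the pivot column for row i)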
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = PQCLEAN_MCELIECE348864F_CLEAN_load8( &mat[ i + j ][ block_idx ] ); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + PQCLEAN_MCELIECE348864F_CLEAN_store8( &mat[ i + j ][ block_idx ], buf[j] ); + } + } + + return 0; +} + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE348864F_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE348864F_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE348864F_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + 
mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/clean/pk_gen.h b/crypto_kem/mceliece348864f/clean/pk_gen.h new file mode 100644 index 00000000..8fee2ed4 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE348864F_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE348864F_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/root.c b/crypto_kem/mceliece348864f/clean/root.c new file mode 100644 index 00000000..462ea6b1 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE348864F_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE348864F_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE348864F_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE348864F_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece348864f/clean/root.h b/crypto_kem/mceliece348864f/clean/root.h new file mode 100644 index 00000000..6485f112 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE348864F_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE348864F_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE348864F_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/sk_gen.c b/crypto_kem/mceliece348864f/clean/sk_gen.c new file mode 100644 index 00000000..3a751e73 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864F_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864F_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { 
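+            // when the diagonal entry mat[j][j] is zero, gf_iszero() returns an all-ones mask and the corresponding entries of column k are XORed into column j, so a nonzero pivot is pulled in without branching on secret data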
+ mask = PQCLEAN_MCELIECE348864F_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864F_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864F_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/clean/sk_gen.h b/crypto_kem/mceliece348864f/clean/sk_gen.h new file mode 100644 index 00000000..6a08b6ab --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE348864F_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864F_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864F_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + + diff --git a/crypto_kem/mceliece348864f/clean/synd.c b/crypto_kem/mceliece348864f/clean/synd.c new file mode 100644 index 00000000..6955e63d --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE348864F_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE348864F_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE348864F_CLEAN_gf_inv(PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE348864F_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE348864F_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece348864f/clean/synd.h b/crypto_kem/mceliece348864f/clean/synd.h new file mode 100644 index 00000000..a715b29a --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_SYND_H +#define PQCLEAN_MCELIECE348864F_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864F_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + + diff --git a/crypto_kem/mceliece348864f/clean/transpose.c b/crypto_kem/mceliece348864f/clean/transpose.c new file mode 100644 index 00000000..318c7e56 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix 
transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece348864f/clean/transpose.h b/crypto_kem/mceliece348864f/clean/transpose.h new file mode 100644 index 00000000..7b5588cc --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE348864F_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/clean/util.c b/crypto_kem/mceliece348864f/clean/util.c new file mode 100644 index 00000000..5be8fe27 --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864F_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864F_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864F_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE348864F_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864F_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864F_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + diff --git a/crypto_kem/mceliece348864f/clean/util.h b/crypto_kem/mceliece348864f/clean/util.h new file mode 100644 index 00000000..710433ae --- /dev/null +++ b/crypto_kem/mceliece348864f/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864F_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE348864F_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE348864F_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t 
PQCLEAN_MCELIECE348864F_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE348864F_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE348864F_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE348864F_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE348864F_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/LICENSE b/crypto_kem/mceliece348864f/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece348864f/sse/Makefile b/crypto_kem/mceliece348864f/sse/Makefile new file mode 100644 index 00000000..5a5e9f64 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/Makefile @@ -0,0 +1,41 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece348864f_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c transpose.c util.c \ + vec.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x64_asm.S update_asm.S vec128_mul_asm.S \ + vec_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h vec.h \ + consts.inc powers.inc scalars_2x.inc scalars.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o transpose.o util.o \ + vec.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x64_asm.o update_asm.o vec128_mul_asm.o \ + vec_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece348864f/sse/aes256ctr.c b/crypto_kem/mceliece348864f/sse/aes256ctr.c new file mode 100644 index 00000000..b3fa41f7 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece348864f/sse/aes256ctr.h b/crypto_kem/mceliece348864f/sse/aes256ctr.h new file mode 100644 index 00000000..7cff2420 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE348864F_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void 
PQCLEAN_MCELIECE348864F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864f/sse/api.h b/crypto_kem/mceliece348864f/sse/api.h new file mode 100644 index 00000000..54c39c3e --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/api.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_API_H +#define PQCLEAN_MCELIECE348864F_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_ALGNAME "Classic McEliece 348864f" +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_BYTES 32 + + + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/benes.c b/crypto_kem/mceliece348864f/sse/benes.c new file mode 100644 index 00000000..c2b12658 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= 
cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864F_SSE_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864F_SSE_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + 
layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(bs); +} + diff --git a/crypto_kem/mceliece348864f/sse/benes.h b/crypto_kem/mceliece348864f/sse/benes.h new file mode 100644 index 00000000..1148f08f --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_BENES_H +#define PQCLEAN_MCELIECE348864F_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_SSE_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864F_SSE_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/bm.c b/crypto_kem/mceliece348864f/sse/bm.c new file mode 100644 index 00000000..1eb846f3 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/bm.c @@ -0,0 +1,220 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include +#include + +extern void PQCLEAN_MCELIECE348864F_SSE_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864F_SSE_vec128_or(PQCLEAN_MCELIECE348864F_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864F_SSE_vec128_sll_2x(PQCLEAN_MCELIECE348864F_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864F_SSE_vec128_or(PQCLEAN_MCELIECE348864F_SSE_vec128_srl_2x(PQCLEAN_MCELIECE348864F_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864F_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864F_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, 
mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE348864F_SSE_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864F_SSE_vec_mul_asm(prod, in_tmp, &BC[0][1], 16); + + PQCLEAN_MCELIECE348864F_SSE_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864F_SSE_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864F_SSE_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864F_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864F_SSE_vec_mul_asm(out, out, &BC[0][1], 16); +} + diff --git a/crypto_kem/mceliece348864f/sse/bm.h b/crypto_kem/mceliece348864f/sse/bm.h new file mode 100644 index 00000000..2862c804 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/bm.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_BM_H +#define PQCLEAN_MCELIECE348864F_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include + +#include 
"params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_SSE_bm(uint64_t *out, vec128 *in); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/consts.S b/crypto_kem/mceliece348864f/sse/consts.S new file mode 100644 index 00000000..5e9d77a8 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE348864F_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE348864F_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE348864F_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864F_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864F_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864F_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864F_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864F_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864F_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864F_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864F_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864F_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864F_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864F_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece348864f/sse/consts.inc b/crypto_kem/mceliece348864f/sse/consts.inc new file mode 100644 index 00000000..4ed2a081 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/consts.inc @@ -0,0 +1,448 @@ +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), 
+ PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), 
+ PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+ PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+ PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/crypto_kem/mceliece348864f/sse/controlbits.c b/crypto_kem/mceliece348864f/sse/controlbits.c new file mode 100644 index 00000000..5eeccdd4 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864F_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864F_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864F_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
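+   (i.e. one control bit per conditional swap: a Benes network on n = 2^w inputs has 2w-1 layers of n/2 switches;
+   the routine below writes the bits for the outermost first and last layers, then recurses on the two half-size subpermutations)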
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864F_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864f/sse/controlbits.h b/crypto_kem/mceliece348864f/sse/controlbits.h new file mode 100644 index 00000000..769b184e --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864F_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864F_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/crypto_hash.h b/crypto_kem/mceliece348864f/sse/crypto_hash.h new file mode 100644 index 00000000..1a95ccb5 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864F_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864f/sse/decrypt.c b/crypto_kem/mceliece348864f/sse/decrypt.c new file mode 100644 index 00000000..22e7522f --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/decrypt.c @@ -0,0 +1,203 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + uint64_t irr_int[ GFBITS ]; + vec128 eval[32][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE348864F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864F_SSE_fft(eval, irr_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + uint8_t r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864F_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864F_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864F_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864F_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE348864F_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u32( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint64_t synd_cmp(vec128 s0[ GFBITS ], vec128 s1[ GFBITS ]) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864F_SSE_vec128_or(diff, PQCLEAN_MCELIECE348864F_SSE_vec128_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE348864F_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864F_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 32 ][ GFBITS ]; + vec128 scaled[ 32 ][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + + vec128 error[ 32 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + + uint64_t locator[ GFBITS ]; + + vec128 recv[ 32 ]; + vec128 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE348864F_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864F_SSE_benes((uint64_t *) recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + + PQCLEAN_MCELIECE348864F_SSE_fft_tr(s_priv, scaled); + + PQCLEAN_MCELIECE348864F_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864F_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864F_SSE_vec128_setbits(1); + + for (i = 0; i < 32; i++) { + error[i] = PQCLEAN_MCELIECE348864F_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE348864F_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE348864F_SSE_benes((uint64_t *) error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864f/sse/decrypt.h b/crypto_kem/mceliece348864f/sse/decrypt.h new file mode 100644 index 00000000..d1158693 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE348864F_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864F_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/encrypt.c b/crypto_kem/mceliece348864f/sse/encrypt.c new file mode 100644 index 00000000..1bd05824 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864F_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint16_t ind[ SYS_T * 2 ]; + uint64_t 
e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind_, sizeof(ind_)); + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864F_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864F_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864f/sse/encrypt.h b/crypto_kem/mceliece348864f/sse/encrypt.h new file mode 100644 index 00000000..0cb745ec --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE348864F_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864F_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/fft.c b/crypto_kem/mceliece348864f/sse/fft.c new file mode 100644 index 00000000..ee16dedf --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/fft.c @@ -0,0 +1,155 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" +#include "vec128.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864F_SSE_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + 
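+  // Each coefficient bit of the bitsliced input is expanded into a full 64-bit mask:
+  // extracting bit reversal[j + k] of in[i] and negating it yields all-zeros or all-ones,
+  // and vec128_set2x packs two such masks into one 128-bit vector, so every output word
+  // holds that bit replicated across a whole lane. The reversal[] table places the
+  // evaluation points in bit-reversed order for the butterfly stages that follow.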
vec128 tmp[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 4) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + + out[j / 2 + 0][i] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(t0, t1); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + + out[j / 2 + 1][i] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(t0, t1); + } + } + + // + + + for (i = 0; i < 32; i += 2) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_low(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_high(out[i + 0][b], out[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = x[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = y[b]; + } + } + + consts_ptr += 1; + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec128 powers[32][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +void PQCLEAN_MCELIECE348864F_SSE_fft(vec128 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece348864f/sse/fft.h b/crypto_kem/mceliece348864f/sse/fft.h new file mode 100644 index 00000000..48c99619 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_FFT_H +#define PQCLEAN_MCELIECE348864F_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_SSE_fft(vec128 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/fft_tr.c b/crypto_kem/mceliece348864f/sse/fft_tr.c new file mode 100644 index 00000000..b4625c0c --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/fft_tr.c @@ -0,0 +1,312 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" +#include "vec128.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864F_SSE_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864F_SSE_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t t[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + + uint64_t out64[2][GFBITS]; + + vec128 p2[ 6 ]; + vec128 buf[64]; + vec128 tt[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 32; + + const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tt, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tt[b]; + } + } + } + } + + for (i = 0; i < 32; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = x[b] ^ y[b]; + } + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tt, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = y[b] ^ tt[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 2) { + for (j = 0; j < 64; j += 4) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 0][i + 1], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 1][i + 1], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 0][i + 1], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 1][i + 0], 1), + 
PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[j / 2 + 1][i + 1], 1)); + } + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[55]); + buf[54] = 
PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[15], buf[14]); + p2[1] = 
PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(p2[j], 1); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(buf[0], 1); + } + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[0] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864F_SSE_vec_mul(out64[1], pre[0], t); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[i] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864F_SSE_vec_mul(t, pre[i], t); + PQCLEAN_MCELIECE348864F_SSE_vec_add(out64[1], out64[1], t); + } + + for (b = 0; b < GFBITS; b++) { + out[b] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(out64[0][b], out64[1][b]); + } +} + +void PQCLEAN_MCELIECE348864F_SSE_fft_tr(vec128 out[GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864f/sse/fft_tr.h b/crypto_kem/mceliece348864f/sse/fft_tr.h new file mode 100644 index 00000000..12f2a8e7 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/fft_tr.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE348864F_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_SSE_fft_tr(vec128 /*out*/[GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/gf.c b/crypto_kem/mceliece348864f/sse/gf.c new file mode 100644 index 00000000..bce89ef6 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864F_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864F_SSE_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864F_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + 
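+        // carry-less multiplication: bit i of t1 selects a copy of t0 shifted left by i;
+        // the up-to-23-bit product is reduced after the loop via x^12 = x^3 + 1 in GF(2^12)
+        // (the >> 9 and >> 12 shifts implement exactly that reduction)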
tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864F_SSE_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864F_SSE_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864F_SSE_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_SSE_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_SSE_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_SSE_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864F_SSE_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864F_SSE_gf_mul(PQCLEAN_MCELIECE348864F_SSE_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864f/sse/gf.h b/crypto_kem/mceliece348864f/sse/gf.h new file mode 100644 index 00000000..f98c4c57 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_GF_H +#define PQCLEAN_MCELIECE348864F_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864F_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864F_SSE_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +gf 
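+// gf_inv computes a^(2^GFBITS - 2), i.e. Fermat inversion, via a fixed chain of
+// squarings and multiplications; gf_frac(den, num) is then num * den^(-1)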
PQCLEAN_MCELIECE348864F_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864F_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864F_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_SSE_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/operations.c b/crypto_kem/mceliece348864f/sse/operations.c new file mode 100644 index 00000000..ecfe628b --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864F_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864F_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864F_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864F_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864F_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864F_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864F_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864F_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864F_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864F_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/sse/params.h 
b/crypto_kem/mceliece348864f/sse/params.h new file mode 100644 index 00000000..578b7328 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_PARAMS_H +#define PQCLEAN_MCELIECE348864F_SSE_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/pk_gen.c b/crypto_kem/mceliece348864f/sse/pk_gen.c new file mode 100644 index 00000000..79a8db88 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/pk_gen.c @@ -0,0 +1,329 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
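+    // (the search is branch-free: the pivot column for row i is the lowest set bit of
+    //  buf[i] | ... | buf[31]; rows below are conditionally folded into row i until that
+    //  bit is set, and the bit is then cleared from every other row using masks)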
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + +int PQCLEAN_MCELIECE348864F_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + uint64_t irr_int[ GFBITS ]; + + vec128 consts[32][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + vec128 prod[ 32 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864F_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE348864F_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864F_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + 
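+                // row block i of mat holds the bit-sliced values alpha^i / g(alpha) over the
+                // (permuted) support: each pass of the outer loop multiplies the previous
+                // block, prod, by the bit-sliced support elements kept here in consts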
mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE348864F_SSE_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864F_SSE_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/sse/pk_gen.h b/crypto_kem/mceliece348864f/sse/pk_gen.h new file mode 100644 index 00000000..4622954f --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE348864F_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864F_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/powers.inc b/crypto_kem/mceliece348864f/sse/powers.inc new file mode 100644 index 00000000..f48a95b6 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/powers.inc @@ -0,0 +1,448 @@ +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 
+PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, diff --git a/crypto_kem/mceliece348864f/sse/scalars.inc b/crypto_kem/mceliece348864f/sse/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 
0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864f/sse/scalars_2x.inc b/crypto_kem/mceliece348864f/sse/scalars_2x.inc new file mode 100644 index 00000000..55ba6c26 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ffffff000000, 
0xffffffff00ff0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/crypto_kem/mceliece348864f/sse/sk_gen.c b/crypto_kem/mceliece348864f/sse/sk_gen.c new file mode 100644 index 00000000..233f5a7f --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864F_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864F_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = 
PQCLEAN_MCELIECE348864F_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864F_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864F_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864F_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864F_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864F_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/sse/sk_gen.h b/crypto_kem/mceliece348864f/sse/sk_gen.h new file mode 100644 index 00000000..84dcfcfd --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE348864F_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE348864F_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864F_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/syndrome_asm.S b/crypto_kem/mceliece348864f/sse/syndrome_asm.S new file mode 100644 index 00000000..59e9f108 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/syndrome_asm.S @@ -0,0 +1,740 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_SSE_syndrome_asm +.global PQCLEAN_MCELIECE348864F_SSE_syndrome_asm +_PQCLEAN_MCELIECE348864F_SSE_syndrome_asm: +PQCLEAN_MCELIECE348864F_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 
112 ] +# asm 1: movdqu 112(ee=reg128#3 +# asm 2: movdqu 112(ee=%xmm2 +movdqu 112(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#3 +# asm 2: movdqu 128(ee=%xmm2 +movdqu 128(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#3 +# asm 2: movdqu 144(ee=%xmm2 +movdqu 144(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#3 +# asm 2: movdqu 160(ee=%xmm2 +movdqu 160(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#3 +# asm 2: movdqu 176(ee=%xmm2 +movdqu 176(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#3 +# asm 2: movdqu 192(ee=%xmm2 +movdqu 192(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#3 +# asm 2: movdqu 208(ee=%xmm2 +movdqu 208(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee 
+# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 336) +# asm 1: movl 336(s=int64#6d +# asm 2: movl 336(s=%r9d +movl 336(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#7d +# asm 2: movl 432(e=%eax +movl 432(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor + +void PQCLEAN_MCELIECE348864F_SSE_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp(vec128 *in); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/transpose_64x128_sp_asm.S 
b/crypto_kem/mceliece348864f/sse/transpose_64x128_sp_asm.S new file mode 100644 index 00000000..9565833b --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/transpose_64x128_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 x0 + +# qhasm: reg128 x1 + +# qhasm: reg128 x2 + +# qhasm: reg128 x3 + +# qhasm: reg128 x4 + +# qhasm: reg128 x5 + +# qhasm: reg128 x6 + +# qhasm: reg128 x7 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 v00 + +# qhasm: reg128 v01 + +# qhasm: reg128 v10 + +# qhasm: reg128 v11 + +# qhasm: reg128 mask0 + +# qhasm: reg128 mask1 + +# qhasm: reg128 mask2 + +# qhasm: reg128 mask3 + +# qhasm: reg128 mask4 + +# qhasm: reg128 mask5 + +# qhasm: enter transpose_64x128_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp_asm +.global PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp_asm +_PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp_asm: +PQCLEAN_MCELIECE348864F_SSE_transpose_64x128_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK5_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_0(%rip),>mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 
384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x 
v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & 
mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: 
vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# 
qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# 
qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq 
$32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# 
asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld 
$16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 
+movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 
unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# 
asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq 
$32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# 
qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand 
v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa 
PQCLEAN_MCELIECE348864F_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + 
+# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 
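# The surrounding qhasm blocks all perform the same masked-shift exchange on a
# register pair (x_lo, x_hi) for some field width s, using a complementary
# mask pair (mask_a, mask_b):
#   x_lo' = (x_lo & mask_a) | ((x_hi & mask_a) << s)
#   x_hi' = ((x_lo & mask_b) >> s) | (x_hi & mask_b)
# For the wide widths (s = 32, 16, 8) the lane-sized vpsllq/vpslld/vpsllw
# shifts already discard the unwanted half, so only the unshifted operand is
# masked. For s = 4, 2, 1, as in the blocks around this point, SSE has no
# shifts on lanes narrower than 16 bits, so both operands are masked first
# (with the PQCLEAN_MCELIECE348864F_SSE_MASK2/MASK1/MASK0 pairs loaded above)
# and a plain 64-bit psllq/psrlq is reused for every width. Running the
# exchange over all widths is the usual branchless network for transposing
# the 64x64 bit blocks held across the xmm registers. (x_lo, x_hi, mask_a,
# mask_b and s are descriptive names only; the generated code calls them
# x0..x7, v00/v01/v10/v11 and mask0..mask5.)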
+ +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq 
$2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: 
x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 
& mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 
496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: 
vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# 
asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 
720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor 
%xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand 
%xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# 
asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + 
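The qhasm annotations above all repeat one pattern: split a pair of rows with a mask pair (the PQCLEAN_MCELIECE348864F_SSE_MASKk_0 / MASKk_1 constants), shift the crossing halves toward each other, and recombine with OR. The following is an illustrative sketch only, not part of the patch: a scalar C analogue of that 64x64 bit-matrix transpose kernel, written under the assumption that the matrix is held as 64 row words. The names transpose_64x64_sketch, exchange_stage and rows are hypothetical; the generated assembly unrolls the same exchanges over 128-bit registers, using vpsllq/vpslld/vpsllw for the 32/16/8-bit distances and psllq/psrlq for the 4/2/1-bit ones.

/* Scalar sketch of the mask/shift/or transpose performed by the assembly above. */
#include <stdint.h>

/* One exchange stage: the scalar analogue of the repeated
 * v00/v10/v01/v11 pattern in the qhasm comments. */
static void exchange_stage(uint64_t *lo, uint64_t *hi,
                           uint64_t m_lo, uint64_t m_hi, int s) {
    uint64_t v00 = *lo & m_lo;           /* v00 = x_lo & mask0            */
    uint64_t v10 = (*hi & m_lo) << s;    /* v10 = x_hi & mask0; v10 <<= s */
    uint64_t v01 = (*lo & m_hi) >> s;    /* v01 = x_lo & mask1; v01 >>= s */
    uint64_t v11 = *hi & m_hi;           /* v11 = x_hi & mask1            */
    *lo = v00 | v10;                     /* x_lo = v00 | v10              */
    *hi = v01 | v11;                     /* x_hi = v01 | v11              */
}

/* Hypothetical helper: transpose a 64x64 bit matrix stored as 64 row words,
 * applying exchange stages at distances 32, 16, 8, 4, 2 and 1. */
static void transpose_64x64_sketch(uint64_t rows[64]) {
    static const uint64_t mask_lo[6] = {
        0x5555555555555555ULL,  /* distance 1  */
        0x3333333333333333ULL,  /* distance 2  */
        0x0F0F0F0F0F0F0F0FULL,  /* distance 4  */
        0x00FF00FF00FF00FFULL,  /* distance 8  */
        0x0000FFFF0000FFFFULL,  /* distance 16 */
        0x00000000FFFFFFFFULL,  /* distance 32 */
    };
    for (int d = 5; d >= 0; d--) {
        int s = 1 << d;
        for (int i = 0; i < 64; i += 2 * s) {
            for (int j = i; j < i + s; j++) {
                exchange_stage(&rows[j], &rows[j + s],
                               mask_lo[d], ~mask_lo[d], s);
            }
        }
    }
}

The assembly keeps the same data flow but loads two 128-bit rows at a time (movdqu/movddup), so each vpand/psllq/vpor handles two 64-bit lanes per instruction and the loops are fully unrolled.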
+# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 
<<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 64 ] x2 +# asm 1: movddup 64(r1=reg128#8 +# asm 2: movddup 64(r1=%xmm7 +movddup 64(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 128 ] x2 +# asm 1: movddup 128(r2=reg128#9 +# asm 2: movddup 128(r2=%xmm8 +movddup 128(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 192 ] x2 +# asm 1: movddup 192(r3=reg128#10 +# asm 2: movddup 
192(r3=%xmm9 +movddup 192(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 256 ] x2 +# asm 1: movddup 256(r4=reg128#11 +# asm 2: movddup 256(r4=%xmm10 +movddup 256(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 320 ] x2 +# asm 1: movddup 320(r5=reg128#12 +# asm 2: movddup 320(r5=%xmm11 +movddup 320(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 384 ] x2 +# asm 1: movddup 384(r6=reg128#13 +# asm 2: movddup 384(r6=%xmm12 +movddup 384(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 448 ] x2 +# asm 1: movddup 448(r7=reg128#14 +# asm 2: movddup 448(r7=%xmm13 +movddup 448(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld 
$16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw 
$8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 0 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 64 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 128 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 192 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 256 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 320 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 384 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 448 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 8(r0=%xmm6 +movddup 8(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r2=reg128#9 +# asm 2: movddup 136(r2=%xmm8 +movddup 136(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r3=reg128#10 +# asm 2: movddup 200(r3=%xmm9 +movddup 200(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r4=reg128#11 +# asm 2: movddup 264(r4=%xmm10 +movddup 264(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r5=reg128#12 +# asm 2: movddup 328(r5=%xmm11 +movddup 328(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r6=reg128#13 +# asm 2: movddup 
392(r6=%xmm12 +movddup 392(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r7=reg128#14 +# asm 2: movddup 456(r7=%xmm13 +movddup 456(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor 
r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand 
v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 8 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 72 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 136 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 200 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 264 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 328 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 392 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 456 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 16(r0=%xmm6 +movddup 16(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r1=reg128#8 +# asm 2: movddup 80(r1=%xmm7 +movddup 80(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r3=reg128#10 +# asm 2: movddup 208(r3=%xmm9 +movddup 208(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r4=reg128#11 +# asm 2: movddup 272(r4=%xmm10 +movddup 272(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r5=reg128#12 +# asm 2: movddup 336(r5=%xmm11 +movddup 336(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r6=reg128#13 +# asm 2: movddup 400(r6=%xmm12 +movddup 400(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r7=reg128#14 +# asm 2: movddup 464(r7=%xmm13 +movddup 464(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# 
qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld 
$16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | 
v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 16 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 80 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 144 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 208 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 272 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 336 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 400 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 464 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 24(r0=%xmm6 +movddup 24(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r1=reg128#8 +# asm 2: movddup 88(r1=%xmm7 +movddup 88(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r2=reg128#9 +# asm 2: movddup 152(r2=%xmm8 +movddup 152(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r4=reg128#11 +# asm 2: movddup 280(r4=%xmm10 +movddup 280(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r5=reg128#12 +# asm 2: movddup 344(r5=%xmm11 +movddup 344(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r6=reg128#13 +# asm 2: movddup 408(r6=%xmm12 +movddup 408(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r7=reg128#14 +# asm 2: movddup 472(r7=%xmm13 +movddup 472(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor 
r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + 
+# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 24 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 88 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 152 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 216 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 280 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 344 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 408 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 472 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 32(r0=%xmm6 +movddup 32(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r1=reg128#8 +# asm 2: movddup 96(r1=%xmm7 +movddup 96(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r2=reg128#9 +# asm 2: movddup 160(r2=%xmm8 +movddup 160(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r3=reg128#10 +# asm 2: movddup 224(r3=%xmm9 +movddup 224(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r5=reg128#12 +# asm 2: movddup 352(r5=%xmm11 +movddup 352(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r6=reg128#13 +# asm 2: movddup 416(r6=%xmm12 +movddup 416(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r7=reg128#14 +# asm 2: movddup 480(r7=%xmm13 +movddup 480(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 
+# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# 
asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor 
%xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 32 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 96 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 160 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 224 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 288 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 352 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 416 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 480 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 40(r0=%xmm6 +movddup 40(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r1=reg128#8 +# asm 2: movddup 104(r1=%xmm7 +movddup 104(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r2=reg128#9 +# asm 2: movddup 168(r2=%xmm8 +movddup 168(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r3=reg128#10 +# asm 2: movddup 232(r3=%xmm9 +movddup 232(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r4=reg128#11 +# asm 2: movddup 296(r4=%xmm10 +movddup 296(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r6=reg128#13 +# asm 2: movddup 424(r6=%xmm12 +movddup 424(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r7=reg128#14 +# asm 2: movddup 488(r7=%xmm13 +movddup 488(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 
2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# 
qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 40 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 104 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 168 ] = buf 
+# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 232 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 296 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 360 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 424 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 488 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 48(r0=%xmm6 +movddup 48(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r1=reg128#8 +# asm 2: movddup 112(r1=%xmm7 +movddup 112(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r2=reg128#9 +# asm 2: movddup 176(r2=%xmm8 +movddup 176(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r3=reg128#10 +# asm 2: movddup 240(r3=%xmm9 +movddup 240(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r4=reg128#11 +# asm 2: movddup 304(r4=%xmm10 +movddup 304(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r5=reg128#12 +# asm 2: movddup 368(r5=%xmm11 +movddup 368(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: 
[qhasm-generated SSE assembly, PQCLEAN_MCELIECE348864F_SSE bit-matrix transpose, continued:

 - masked exchange passes over row pairs (r0..r7 in %xmm6-%xmm14, masks in %xmm0-%xmm5, temporary %xmm15)
   using vpand/vpor combined with vpsllq/vpsrlq (shift 32), vpslld/vpsrld (shift 16) and vpsllw/vpsrlw
   (shift 8); the 64-bit rows are loaded with movddup at 64-byte stride from input_0 and written back one
   qword at a time with pextrq/movq (offsets 48, 112, ..., 496, then 56, 120, ..., 504);

 - reload of the mask pair constants PQCLEAN_MCELIECE348864F_SSE_MASK2_0/_1, MASK1_0/_1 and MASK0_0/_1
   into %xmm0-%xmm5 with movdqa;

 - masked exchange passes with psllq/psrlq shift widths 4, 2 and 1 over eight 64-bit rows at a time
   (movddup loads at 8-byte stride: offsets 0-56, 64-120, 128-184, 192-248, 256-312, 320-376, ...), each
   block written back as four 128-bit words with vpunpcklqdq/movdqu.]
%xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor 
r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: 
psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & 
mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# 
asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# 
qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 
0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_SSE_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864F_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864F_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864F_SSE_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864F_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864F_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864F_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864F_SSE_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864F_SSE_vec128_set2x( PQCLEAN_MCELIECE348864F_SSE_load8(in), PQCLEAN_MCELIECE348864F_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864F_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864F_SSE_store8(out + 0, 
PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864F_SSE_store8(out + 8, PQCLEAN_MCELIECE348864F_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece348864f/sse/util.h b/crypto_kem/mceliece348864f/sse/util.h new file mode 100644 index 00000000..d7c14805 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_UTIL_H +#define PQCLEAN_MCELIECE348864F_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864F_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864F_SSE_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864F_SSE_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864F_SSE_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864F_SSE_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864F_SSE_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864F_SSE_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864F_SSE_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864F_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864F_SSE_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/crypto_kem/mceliece348864f/sse/vec.c b/crypto_kem/mceliece348864f/sse/vec.c new file mode 100644 index 00000000..5745ff95 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec.c @@ -0,0 +1,17 @@ + +#include "vec.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864F_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864F_SSE_vec_mul_asm(h, f, g, 8); +} + +void PQCLEAN_MCELIECE348864F_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { + int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/crypto_kem/mceliece348864f/sse/vec.h b/crypto_kem/mceliece348864f/sse/vec.h new file mode 100644 index 00000000..ffdc28be --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_VEC_H +#define PQCLEAN_MCELIECE348864F_SSE_VEC_H + +#include + +extern void PQCLEAN_MCELIECE348864F_SSE_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *, int); + +void PQCLEAN_MCELIECE348864F_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); +void PQCLEAN_MCELIECE348864F_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/crypto_kem/mceliece348864f/sse/vec128.c b/crypto_kem/mceliece348864f/sse/vec128.c new file mode 100644 index 00000000..270806f3 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec128.c @@ -0,0 +1,143 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864F_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 
PQCLEAN_MCELIECE348864F_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864F_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864F_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864F_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE348864F_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS]; + + result[0] = in[0] ^ in[6]; + result[1] = in[11]; + result[2] = in[1] ^ in[7]; + result[3] = in[6]; + result[4] = in[2] ^ in[11] ^ in[8]; + result[5] = in[7]; + result[6] = in[3] ^ in[9]; + result[7] = in[8]; + result[8] = in[4] ^ in[10]; + result[9] = in[9]; + result[10] = in[5] ^ in[11]; + result[11] = in[10]; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE348864F_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE348864F_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp_11, out, in); // 11 + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(tmp_1111, out, tmp_11); // 1111 + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(out, out, tmp_1111); // 11111111 + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(out, out, tmp_11); // 1111111111 + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864F_SSE_vec128_mul(out, out, in); // 11111111111 + + PQCLEAN_MCELIECE348864F_SSE_vec128_sq(out, out); // 111111111110 +} + diff --git a/crypto_kem/mceliece348864f/sse/vec128.h b/crypto_kem/mceliece348864f/sse/vec128.h new file mode 100644 index 00000000..236de8d3 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec128.h @@ -0,0 +1,42 @@ +#ifndef PQCLEAN_MCELIECE348864F_SSE_VEC128_H +#define PQCLEAN_MCELIECE348864F_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations 
+*/ + + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE348864F_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864F_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864F_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864F_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE348864F_SSE_vec128_sq(vec128 * /*out*/, const vec128 * /*in*/); +void PQCLEAN_MCELIECE348864F_SSE_vec128_inv(vec128 * /*out*/, const vec128 * /*in*/); + +#endif diff --git a/crypto_kem/mceliece348864f/sse/vec128_mul_asm.S b/crypto_kem/mceliece348864f/sse/vec128_mul_asm.S new file mode 100644 index 00000000..7eba1915 --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec128_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE348864F_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ 
input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#2 +# asm 2: movdqu 176(a11=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg128#3 +# asm 2: vpand r11=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r12 = a11 & mem128[input_2 + 16] +# asm 1: vpand 16(r12=reg128#4 +# asm 2: vpand 16(r12=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r13 = a11 & mem128[input_2 + 32] +# asm 1: vpand 32(r13=reg128#5 +# asm 2: vpand 32(r13=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r14 = a11 & mem128[input_2 + 48] +# asm 1: vpand 48(r14=reg128#6 +# asm 2: vpand 48(r14=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r15 = a11 & mem128[input_2 + 64] +# asm 1: vpand 64(r15=reg128#7 +# asm 2: vpand 64(r15=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r16 = a11 & mem128[input_2 + 80] +# asm 1: vpand 80(r16=reg128#8 +# asm 2: vpand 80(r16=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r17 = a11 & mem128[input_2 + 96] +# asm 1: vpand 96(r17=reg128#9 +# asm 2: vpand 96(r17=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r18 = a11 & mem128[input_2 + 112] +# asm 1: vpand 112(r18=reg128#10 +# asm 2: vpand 112(r18=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r19 = a11 & mem128[input_2 + 128] +# asm 1: vpand 128(r19=reg128#11 +# asm 2: vpand 128(r19=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r20 = a11 & mem128[input_2 + 144] +# asm 1: vpand 144(r20=reg128#12 +# asm 2: vpand 144(r20=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r21 = a11 & mem128[input_2 + 160] +# asm 1: vpand 160(r21=reg128#13 +# asm 2: vpand 160(r21=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r22 = a11 & mem128[input_2 + 176] +# asm 1: vpand 176(r22=reg128#2 +# asm 2: vpand 176(r22=%xmm1 +vpand 176(%rdx),%xmm1,%xmm1 + +# qhasm: r13 ^= r22 +# asm 1: pxor r10=reg128#2 +# asm 2: movdqa r10=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#14 +# asm 2: movdqu 160(a10=%xmm13 +movdqu 160(%rsi),%xmm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r21 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#14 +# asm 2: movdqu 144(a9=%xmm13 
+movdqu 144(%rsi),%xmm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r20 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#14 +# asm 2: movdqu 128(a8=%xmm13 +movdqu 128(%rsi),%xmm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r19 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#14 +# asm 2: movdqu 112(a7=%xmm13 +movdqu 112(%rsi),%xmm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 
64(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r18 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#14 +# asm 2: movdqu 96(a6=%xmm13 +movdqu 96(%rsi),%xmm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r17 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#14 +# asm 2: movdqu 80(a5=%xmm13 +movdqu 80(%rsi),%xmm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + 
+# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r16 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#14 +# asm 2: movdqu 64(a4=%xmm13 +movdqu 64(%rsi),%xmm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r15 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#14 +# asm 2: movdqu 48(a3=%xmm13 +movdqu 48(%rsi),%xmm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r14 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#14 +# asm 2: movdqu 32(a2=%xmm13 +movdqu 32(%rsi),%xmm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 
1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r13 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#14 +# asm 2: movdqu 16(a1=%xmm13 +movdqu 16(%rsi),%xmm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r12 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#14 +# asm 2: movdqu 0(a0=%xmm13 +movdqu 0(%rsi),%xmm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm13,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm13,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm13,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm13,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm13,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 
96(%rdx),%xmm13,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm13,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm13,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm13,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm13,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm13,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 11 +# asm 1: imulq $11,tmp=int64#6 +# asm 2: imulq $11,tmp=%r9 +imulq $11,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b11=reg128#1 +# asm 2: movddup 0(b11=%xmm0 +movddup 0(%rdx),%xmm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub r16=reg128#3 +# asm 2: vpand r16=%xmm2 +vpand %xmm1,%xmm0,%xmm2 + +# qhasm: mem128[ ptr + 256 ] = r16 +# asm 1: movdqu r15=reg128#4 +# asm 2: vpand r15=%xmm3 +vpand %xmm2,%xmm0,%xmm3 + +# qhasm: a3[0] = mem64[ input_1 + 24 ] +# asm 1: pinsrq $0x0,24(r14=reg128#6 +# asm 2: vpand r14=%xmm5 +vpand %xmm4,%xmm0,%xmm5 + +# qhasm: a2[0] = mem64[ input_1 + 16 ] +# asm 1: pinsrq $0x0,16(r13=reg128#8 +# asm 2: vpand r13=%xmm7 +vpand %xmm6,%xmm0,%xmm7 + +# qhasm: a1[0] = mem64[ input_1 + 8 ] +# asm 1: pinsrq $0x0,8(r12=reg128#10 +# asm 2: vpand r12=%xmm9 +vpand %xmm8,%xmm0,%xmm9 + +# qhasm: a0[0] = mem64[ input_1 + 0 ] +# asm 1: pinsrq $0x0,0(r11=reg128#1 +# asm 2: vpand r11=%xmm0 +vpand %xmm10,%xmm0,%xmm0 + +# qhasm: b10 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b10=reg128#12 +# asm 2: movddup 0(b10=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r11 ^= r +# asm 1: pxor r10=reg128#4 +# asm 2: vpand r10=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b9 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b9=reg128#12 +# asm 2: movddup 0(b9=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm2,%xmm11,%xmm5 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand 
r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm6,%xmm11,%xmm5 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r10 ^= r +# asm 1: pxor r9=reg128#6 +# asm 2: vpand r9=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b8 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b8=reg128#12 +# asm 2: movddup 0(b8=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r9 ^= r +# asm 1: pxor r8=reg128#8 +# asm 2: vpand r8=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b7 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b7=reg128#12 +# asm 2: movddup 0(b7=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r8 ^= r +# asm 1: pxor r7=reg128#10 +# asm 2: vpand r7=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b6 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b6=reg128#12 +# asm 2: movddup 0(b6=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r6=reg128#1 +# asm 2: vpand r6=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b5 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b5=reg128#12 +# asm 2: movddup 0(b5=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r6 ^= r +# asm 1: pxor r5=reg128#4 +# asm 2: vpand r5=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b4 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b4=reg128#12 +# asm 2: movddup 0(b4=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand 
%xmm2,%xmm11,%xmm5 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm6,%xmm11,%xmm5 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r5 ^= r +# asm 1: pxor r4=reg128#6 +# asm 2: vpand r4=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b3 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b3=reg128#12 +# asm 2: movddup 0(b3=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r4 ^= r +# asm 1: pxor r3=reg128#8 +# asm 2: vpand r3=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b2 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b2=reg128#12 +# asm 2: movddup 0(b2=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r3 ^= r +# asm 1: pxor r2=reg128#10 +# asm 2: vpand r2=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b1 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b1=reg128#12 +# asm 2: movddup 0(b1=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r1=reg128#1 +# asm 2: vpand r1=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b0 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b0=reg128#12 +# asm 2: movddup 0(b0=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm1,%xmm11,%xmm1 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm2,%xmm11,%xmm1 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm4,%xmm11,%xmm1 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm6,%xmm11,%xmm1 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm8,%xmm11,%xmm1 + +# qhasm: r1 ^= r +# asm 1: pxor r0=reg128#2 +# asm 2: vpand r0=%xmm1 +vpand %xmm10,%xmm11,%xmm1 + +# qhasm: mem128[ ptr + 64 ] = r4 +# asm 1: movdqu h22=int64#2 +# asm 2: movq 264(h22=%rsi +movq 264(%r8),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov 
h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 248 ] +# asm 1: movq 248(h21=int64#4 +# asm 2: movq 248(h21=%rcx +movq 248(%r8),%rcx + +# qhasm: h12 = h21 +# asm 1: mov h12=int64#6 +# asm 2: mov h12=%r9 +mov %rcx,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#4 +# asm 2: mov h9=%rcx +mov %rcx,%rcx + +# qhasm: h20 = mem64[ ptr + 232 ] +# asm 1: movq 232(h20=int64#7 +# asm 2: movq 232(h20=%rax +movq 232(%r8),%rax + +# qhasm: h11 = h20 +# asm 1: mov h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 216 ] +# asm 1: movq 216(h19=int64#9 +# asm 2: movq 216(h19=%r11 +movq 216(%r8),%r11 + +# qhasm: h10 ^= h19 +# asm 1: xor h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 200 ] +# asm 1: movq 200(h18=int64#10 +# asm 2: movq 200(h18=%r12 +movq 200(%r8),%r12 + +# qhasm: h9 ^= h18 +# asm 1: xor h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 184 ] +# asm 1: movq 184(h17=int64#11 +# asm 2: movq 184(h17=%r13 +movq 184(%r8),%r13 + +# qhasm: h8 ^= h17 +# asm 1: xor h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 168 ] +# asm 1: movq 168(h16=int64#12 +# asm 2: movq 168(h16=%r14 +movq 168(%r8),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 152 ] +# asm 1: movq 152(h15=int64#13 +# asm 2: movq 152(h15=%r15 +movq 152(%r8),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 240 ) +# asm 1: xorq 240(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 136 ] +# asm 1: movq 136(h14=int64#14 +# asm 2: movq 136(h14=%rbx +movq 136(%r8),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 120 ) +# asm 1: xorq 120(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 104 ) +# asm 1: xorq 104(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/crypto_kem/mceliece348864f/sse/vec_reduce_asm.S b/crypto_kem/mceliece348864f/sse/vec_reduce_asm.S new file mode 100644 index 00000000..59d99dbb --- /dev/null +++ b/crypto_kem/mceliece348864f/sse/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# 
qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm +.global PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm +_PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm: +PQCLEAN_MCELIECE348864F_SSE_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864F_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece348864f/vec/api.h b/crypto_kem/mceliece348864f/vec/api.h new file mode 100644 index 00000000..e5fa7b1b --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/api.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_API_H +#define PQCLEAN_MCELIECE348864F_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_ALGNAME "Classic McEliece 348864f" +#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_SECRETKEYBYTES 6452 
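/*
 * Usage sketch for the KEM API declared in this header. The harness below is
 * hypothetical (it is not part of the patch); it only assumes the usual
 * PQClean convention that the caller allocates every buffer at the advertised
 * size and that each call returns 0 on success.
 */
#include <stdint.h>
#include <string.h>

static int mceliece348864f_vec_roundtrip(void) {
    /* static: the ~255 KB public key is too large for most stacks */
    static uint8_t pk[PQCLEAN_MCELIECE348864F_VEC_CRYPTO_PUBLICKEYBYTES];
    static uint8_t sk[PQCLEAN_MCELIECE348864F_VEC_CRYPTO_SECRETKEYBYTES];
    uint8_t ct[PQCLEAN_MCELIECE348864F_VEC_CRYPTO_CIPHERTEXTBYTES];
    uint8_t ss_enc[PQCLEAN_MCELIECE348864F_VEC_CRYPTO_BYTES];
    uint8_t ss_dec[PQCLEAN_MCELIECE348864F_VEC_CRYPTO_BYTES];

    if (PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair(pk, sk) != 0) {
        return -1; /* key generation failed */
    }
    if (PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc(ct, ss_enc, pk) != 0) {
        return -1; /* encapsulation failed */
    }
    if (PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec(ss_dec, ct, sk) != 0) {
        return -1; /* decapsulation failed */
    }

    /* encapsulated and decapsulated shared secrets must agree */
    return memcmp(ss_enc, ss_dec, sizeof ss_enc) == 0 ? 0 : -1;
}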
+#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_BYTES 32 + + + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/benes.c b/crypto_kem/mceliece348864f/vec/benes.c new file mode 100644 index 00000000..254f8a85 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/benes.c @@ -0,0 +1,95 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* one layer of the benes network */ +static void layer(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864F_VEC_benes(uint64_t *r, const unsigned char *bits, int rev) { + int i; + + const unsigned char *cond_ptr; + int inc, low; + + uint64_t cond[64]; + + // + + if (rev == 0) { + inc = 256; + cond_ptr = bits; + } else { + inc = -256; + cond_ptr = bits + (2 * GFBITS - 2) * 256; + } + + // + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(r, r); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_VEC_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(cond, cond); + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(r, r); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_VEC_load8(cond_ptr + i * 8); + } + layer(r, cond, low); + cond_ptr += inc; + } + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_VEC_load8(cond_ptr + i * 8); + } + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(r, r); + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_VEC_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(cond, cond); + layer(r, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(r, r); +} + diff --git a/crypto_kem/mceliece348864f/vec/benes.h b/crypto_kem/mceliece348864f/vec/benes.h new file mode 100644 index 00000000..51fdd3ea --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_BENES_H +#define PQCLEAN_MCELIECE348864F_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE348864F_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/bm.c b/crypto_kem/mceliece348864f/vec/bm.c new file mode 100644 index 00000000..281864e0 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/bm.c @@ -0,0 +1,247 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" 
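/*
 * The Benes-network code above and the Berlekamp-Massey helpers that follow
 * (mask_nonzero, mask_leq, vec_cmov) all rely on the same branch-free masking
 * idiom: a secret condition is stretched into an all-zeros or all-ones 64-bit
 * mask, which then selects or swaps data without any secret-dependent branch.
 * A minimal self-contained illustration, with hypothetical helper names:
 */
#include <stdint.h>

/* returns 0xFFFFFFFFFFFFFFFF if the low bit of `bit` is set, 0 otherwise */
static uint64_t mask_bit(uint64_t bit) {
    return (uint64_t)0 - (bit & 1);
}

/* constant-time conditional swap, the operation performed by one Benes layer */
static void cond_swap(uint64_t *a, uint64_t *b, uint64_t bit) {
    uint64_t m = mask_bit(bit);
    uint64_t d = (*a ^ *b) & m; /* difference, kept only when the mask is set */
    *a ^= d;
    *b ^= d;
}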
+#include "util.h" + +#include + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static inline gf vec_reduce(const vec *in) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +static void update(vec *in, const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[i] = (in[i] >> 1) | (tmp 
<< 63); + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE348864F_VEC_bm(vec *out, vec in[][ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + vec prod[ GFBITS ]; + vec in_tmp[ GFBITS ]; + + vec d_vec[ GFBITS ]; + vec b_vec[ GFBITS ]; + vec B[ GFBITS ], C[ GFBITS ]; + vec B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + vec mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + + C[0] = 0; + B[0] = 1; + B[0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + B[i] = C[i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(prod, in_tmp, C); + + update(in_tmp, coefs[N]); + + d = vec_reduce(prod); + + t = PQCLEAN_MCELIECE348864F_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + d_vec[i] = PQCLEAN_MCELIECE348864F_VEC_vec_setbits((d >> i) & 1); + b_vec[i] = PQCLEAN_MCELIECE348864F_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(B_tmp, d_vec, B); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(C_tmp, b_vec, C); + + vec_cmov(B, C, (uint16_t)mask); + update(B, mask & c0); + + for (i = 0; i < GFBITS; i++) { + C[i] = B_tmp[i] ^ C_tmp[i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864F_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE348864F_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece348864f/vec/bm.h b/crypto_kem/mceliece348864f/vec/bm.h new file mode 100644 index 00000000..27e4369d --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_BM_H +#define PQCLEAN_MCELIECE348864F_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE348864F_VEC_bm(vec * /*out*/, vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/consts.inc b/crypto_kem/mceliece348864f/vec/consts.inc new file mode 100644 index 00000000..a728344f --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/consts.inc @@ -0,0 +1,888 @@ +//64 +{ + 0XF00F0FF0F00F0FF0, + 0XF0F00F0F0F0FF0F0, + 0X0FF00FF00FF00FF0, + 0XAA5555AAAA5555AA, + 0XF00F0FF0F00F0FF0, + 0X33CCCC33CC3333CC, + 0XFFFF0000FFFF0000, + 0XCC33CC3333CC33CC, + 0X33CC33CC33CC33CC, + 0X5A5A5A5A5A5A5A5A, + 0XFF00FF00FF00FF00, + 0XF00F0FF0F00F0FF0, +}, +//128 +{ + 0X3C3C3C3C3C3C3C3C, + 0XF0F0F0F0F0F0F0F0, + 0X5555AAAA5555AAAA, + 0XCC3333CCCC3333CC, + 0XC33CC33CC33CC33C, + 0X55555555AAAAAAAA, + 0X33333333CCCCCCCC, + 0X00FF00FFFF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0X0000000000000000, + 0X0000FFFFFFFF0000, + 0XF0F00F0F0F0FF0F0, +}, +{ + 0X3C3C3C3C3C3C3C3C, + 0X0F0F0F0F0F0F0F0F, + 0XAAAA5555AAAA5555, + 0XCC3333CCCC3333CC, + 0XC33CC33CC33CC33C, + 0X55555555AAAAAAAA, + 0X33333333CCCCCCCC, + 0XFF00FF0000FF00FF, + 0X0F0F0F0F0F0F0F0F, + 0X0000000000000000, + 0X0000FFFFFFFF0000, + 0XF0F00F0F0F0FF0F0, +}, +//256 +{ + 0XAA55AA5555AA55AA, + 0XCC33CC3333CC33CC, + 0X33CCCC33CC3333CC, + 0X55555555AAAAAAAA, + 0XFF0000FF00FFFF00, + 0X3CC33CC3C33CC33C, + 0X5555AAAA5555AAAA, + 0X0FF00FF00FF00FF0, + 
0XCCCC33333333CCCC, + 0XF0F0F0F0F0F0F0F0, + 0X00FFFF0000FFFF00, + 0XC33CC33CC33CC33C, +}, +{ + 0X55AA55AAAA55AA55, + 0XCC33CC3333CC33CC, + 0XCC3333CC33CCCC33, + 0X55555555AAAAAAAA, + 0XFF0000FF00FFFF00, + 0XC33CC33C3CC33CC3, + 0XAAAA5555AAAA5555, + 0XF00FF00FF00FF00F, + 0X3333CCCCCCCC3333, + 0X0F0F0F0F0F0F0F0F, + 0XFF0000FFFF0000FF, + 0XC33CC33CC33CC33C, +}, +{ + 0XAA55AA5555AA55AA, + 0X33CC33CCCC33CC33, + 0XCC3333CC33CCCC33, + 0X55555555AAAAAAAA, + 0X00FFFF00FF0000FF, + 0X3CC33CC3C33CC33C, + 0X5555AAAA5555AAAA, + 0X0FF00FF00FF00FF0, + 0X3333CCCCCCCC3333, + 0XF0F0F0F0F0F0F0F0, + 0X00FFFF0000FFFF00, + 0XC33CC33CC33CC33C, +}, +{ + 0X55AA55AAAA55AA55, + 0X33CC33CCCC33CC33, + 0X33CCCC33CC3333CC, + 0X55555555AAAAAAAA, + 0X00FFFF00FF0000FF, + 0XC33CC33C3CC33CC3, + 0XAAAA5555AAAA5555, + 0XF00FF00FF00FF00F, + 0XCCCC33333333CCCC, + 0X0F0F0F0F0F0F0F0F, + 0XFF0000FFFF0000FF, + 0XC33CC33CC33CC33C, +}, +//512 +{ + 0X6699669999669966, + 0X33CCCC33CC3333CC, + 0XA5A5A5A55A5A5A5A, + 0X3C3CC3C3C3C33C3C, + 0XF00FF00F0FF00FF0, + 0X55AA55AA55AA55AA, + 0X3C3CC3C3C3C33C3C, + 0X0F0F0F0FF0F0F0F0, + 0X55AA55AA55AA55AA, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0X33CCCC33CC3333CC, + 0XA5A5A5A55A5A5A5A, + 0X3C3CC3C3C3C33C3C, + 0X0FF00FF0F00FF00F, + 0XAA55AA55AA55AA55, + 0X3C3CC3C3C3C33C3C, + 0XF0F0F0F00F0F0F0F, + 0XAA55AA55AA55AA55, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0X33CCCC33CC3333CC, + 0X5A5A5A5AA5A5A5A5, + 0XC3C33C3C3C3CC3C3, + 0X0FF00FF0F00FF00F, + 0XAA55AA55AA55AA55, + 0XC3C33C3C3C3CC3C3, + 0X0F0F0F0FF0F0F0F0, + 0XAA55AA55AA55AA55, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0X33CCCC33CC3333CC, + 0X5A5A5A5AA5A5A5A5, + 0XC3C33C3C3C3CC3C3, + 0XF00FF00F0FF00FF0, + 0X55AA55AA55AA55AA, + 0XC3C33C3C3C3CC3C3, + 0XF0F0F0F00F0F0F0F, + 0X55AA55AA55AA55AA, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0XCC3333CC33CCCC33, + 0X5A5A5A5AA5A5A5A5, + 0X3C3CC3C3C3C33C3C, + 0X0FF00FF0F00FF00F, + 0X55AA55AA55AA55AA, + 0X3C3CC3C3C3C33C3C, + 0X0F0F0F0FF0F0F0F0, + 0X55AA55AA55AA55AA, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0XCC3333CC33CCCC33, + 0X5A5A5A5AA5A5A5A5, + 0X3C3CC3C3C3C33C3C, + 0XF00FF00F0FF00FF0, + 0XAA55AA55AA55AA55, + 0X3C3CC3C3C3C33C3C, + 0XF0F0F0F00F0F0F0F, + 0XAA55AA55AA55AA55, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +{ + 0X6699669999669966, + 0XCC3333CC33CCCC33, + 0XA5A5A5A55A5A5A5A, + 0XC3C33C3C3C3CC3C3, + 0XF00FF00F0FF00FF0, + 0XAA55AA55AA55AA55, + 0XC3C33C3C3C3CC3C3, + 0X0F0F0F0FF0F0F0F0, + 0XAA55AA55AA55AA55, + 0X33CCCC33CC3333CC, + 0XF0F0F0F0F0F0F0F0, + 0XA55A5AA55AA5A55A, +}, +{ + 0X9966996666996699, + 0XCC3333CC33CCCC33, + 0XA5A5A5A55A5A5A5A, + 0XC3C33C3C3C3CC3C3, + 0X0FF00FF0F00FF00F, + 0X55AA55AA55AA55AA, + 0XC3C33C3C3C3CC3C3, + 0XF0F0F0F00F0F0F0F, + 0X55AA55AA55AA55AA, + 0XCC3333CC33CCCC33, + 0X0F0F0F0F0F0F0F0F, + 0XA55A5AA55AA5A55A, +}, +//1024 +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 
0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X6996699669966996, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X9669966996699669, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0XFF0000FFFF0000FF, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0XFF00FF00FF00FF00, + 0X0FF00FF0F00FF00F, + 0X0F0FF0F0F0F00F0F, + 0XC33C3CC33CC3C33C, + 0XC33C3CC33CC3C33C, + 
0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +{ + 0X9669699696696996, + 0X9669966996699669, + 0X6996699669966996, + 0X00FFFF0000FFFF00, + 0X00FF00FF00FF00FF, + 0XF00FF00F0FF00FF0, + 0XF0F00F0F0F0FF0F0, + 0X3CC3C33CC33C3CC3, + 0X3CC3C33CC33C3CC3, + 0XA55A5AA55AA5A55A, + 0XC33C3CC33CC3C33C, + 0X3CC3C33C3CC3C33C, +}, +//2048 +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 
0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 
0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFF0000FFFF0000, + 0XFF00FF00FF00FF00, + 0XF0F0F0F0F0F0F0F0, + 0XCCCCCCCCCCCCCCCC, + 0XAAAAAAAAAAAAAAAA, +} diff --git a/crypto_kem/mceliece348864f/vec/controlbits.c b/crypto_kem/mceliece348864f/vec/controlbits.c new file mode 100644 index 00000000..93b268b2 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864F_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864F_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864F_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864F_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece348864f/vec/controlbits.h b/crypto_kem/mceliece348864f/vec/controlbits.h new file mode 100644 index 00000000..9ea0e223 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864F_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864F_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/crypto_hash.h b/crypto_kem/mceliece348864f/vec/crypto_hash.h new file mode 100644 index 00000000..0a147cbb --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864F_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece348864f/vec/decrypt.c b/crypto_kem/mceliece348864f/vec/decrypt.c new file mode 100644 index 00000000..342276c0 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[ GFBITS ]; + vec eval[64][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE348864F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864F_VEC_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864F_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE348864F_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864F_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static uint16_t weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 2; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return 
(uint16_t)PQCLEAN_MCELIECE348864F_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864F_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 64 ][ GFBITS ]; + vec scaled[ 64 ][ GFBITS ]; + vec eval[ 64 ][ GFBITS ]; + + vec error[ 64 ]; + + vec s_priv[ 2 ][ GFBITS ]; + vec s_priv_cmp[ 2 ][ GFBITS ]; + vec locator[ GFBITS ]; + + vec recv[ 64 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE348864F_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE348864F_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864F_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864F_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864F_VEC_vec_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE348864F_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE348864F_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE348864F_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece348864f/vec/decrypt.h b/crypto_kem/mceliece348864f/vec/decrypt.h new file mode 100644 index 00000000..17f1e19b --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE348864F_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864F_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/encrypt.c b/crypto_kem/mceliece348864f/vec/encrypt.c new file mode 100644 index 00000000..519e268a --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/encrypt.c @@ -0,0 +1,135 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = (uint16_t)ind_8[i + 1] << 8 | ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & 
mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864F_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE348864F_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE348864F_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE348864F_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE348864F_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +void PQCLEAN_MCELIECE348864F_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece348864f/vec/encrypt.h b/crypto_kem/mceliece348864f/vec/encrypt.h new file mode 100644 index 00000000..6165b324 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE348864F_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864F_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/fft.c b/crypto_kem/mceliece348864f/vec/fft.c new file mode 100644 index 00000000..3fe9fa91 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/fft.c @@ -0,0 +1,113 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(uint64_t out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t consts[ 63 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 
+ }; + + // boradcast + + for (j = 0; j < 64; j++) { + for (i = 0; i < GFBITS; i++) { + out[j][i] = (in[i] >> reversal[j]) & 1; + out[j][i] = -out[j][i]; + } + } + + // butterflies + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // + + // adding the part contributed by x^64 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } + +} + +void PQCLEAN_MCELIECE348864F_VEC_fft(vec out[][ GFBITS ], vec *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece348864f/vec/fft.h b/crypto_kem/mceliece348864f/vec/fft.h new file mode 100644 index 00000000..b0eb29da --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/fft.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_FFT_H +#define PQCLEAN_MCELIECE348864F_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "params.h" +#include "vec.h" +#include + +void PQCLEAN_MCELIECE348864F_VEC_fft(vec /*out*/[][ GFBITS ], vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/fft_tr.c b/crypto_kem/mceliece348864f/vec/fft_tr.c new file mode 100644 index 00000000..2d99d52c --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/fft_tr.c @@ -0,0 +1,268 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include + +#define vec_add(z, x, y) \ + for (b = 0; b < GFBITS; b++) { \ + (z)[b] = (x)[b] ^ (y)[b]; \ + } + +static inline void radix_conversions_tr(uint64_t in[][ GFBITS ]) { + int i, j, k; + + const uint64_t mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const uint64_t s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 5; j >= 0; j--) { + if (j < 5) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE348864F_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= (in[0][i] & mask[5][0]) >> 32; + in[1][i] ^= (in[1][i] & mask[5][1]) << 32; + } + } +} + +static inline void butterflies_tr(uint64_t out[][ GFBITS ], uint64_t in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t buf[64]; + + const uint64_t consts[ 63 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 63; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 
3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + vec_add(in[k], in[k], in[k + s]); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + vec_add(in[k + s], in[k + s], tmp); + } + } + } + + // transpose + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < 64; j++) { + buf[ reversal[j] ] = in[j][i]; + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 64; j++) { + in[j][i] = buf[ j ]; + } + } + + // boradcast + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[0], in[32]); + vec_add(in[33], in[33], in[32]); + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[1], in[33]); + vec_add(in[35], in[35], in[33]); + vec_add(pre[0], pre[0], in[35]); + vec_add(in[34], in[34], in[35]); + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[2], in[34]); + vec_add(in[38], in[38], in[34]); + vec_add(pre[0], pre[0], in[38]); + vec_add(in[39], in[39], in[38]); + vec_add(pre[1], pre[1], in[39]); + vec_add(in[37], in[37], in[39]); + vec_add(pre[0], pre[0], in[37]); + vec_add(in[36], in[36], in[37]); + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[3], in[36]); + vec_add(in[44], in[44], in[36]); + vec_add(pre[0], pre[0], in[44]); + vec_add(in[45], in[45], in[44]); + vec_add(pre[1], pre[1], in[45]); + vec_add(in[47], in[47], in[45]); + vec_add(pre[0], pre[0], in[47]); + vec_add(in[46], in[46], in[47]); + vec_add(pre[2], pre[2], in[46]); + vec_add(in[42], in[42], in[46]); + vec_add(pre[0], pre[0], in[42]); + vec_add(in[43], in[43], in[42]); + vec_add(pre[1], pre[1], in[43]); + vec_add(in[41], in[41], in[43]); + vec_add(pre[0], pre[0], in[41]); + vec_add(in[40], in[40], in[41]); + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[4], in[40]); + vec_add(in[56], in[56], in[40]); + vec_add(pre[0], pre[0], in[56]); + vec_add(in[57], in[57], in[56]); + vec_add(pre[1], pre[1], in[57]); + vec_add(in[59], in[59], in[57]); + vec_add(pre[0], pre[0], in[59]); + vec_add(in[58], in[58], in[59]); + vec_add(pre[2], pre[2], in[58]); + vec_add(in[62], in[62], in[58]); + vec_add(pre[0], pre[0], in[62]); + vec_add(in[63], in[63], in[62]); + vec_add(pre[1], pre[1], in[63]); + vec_add(in[61], in[61], in[63]); + vec_add(pre[0], pre[0], in[61]); + vec_add(in[60], in[60], in[61]); + vec_add(pre[3], pre[3], in[60]); + vec_add(in[52], in[52], in[60]); + vec_add(pre[0], pre[0], in[52]); + vec_add(in[53], in[53], in[52]); + vec_add(pre[1], pre[1], in[53]); + vec_add(in[55], in[55], in[53]); + vec_add(pre[0], pre[0], in[55]); + vec_add(in[54], in[54], in[55]); + vec_add(pre[2], pre[2], in[54]); + vec_add(in[50], in[50], in[54]); + vec_add(pre[0], pre[0], in[50]); + vec_add(in[51], in[51], in[50]); + vec_add(pre[1], pre[1], in[51]); + vec_add(in[49], in[49], in[51]); + vec_add(pre[0], pre[0], in[49]); + vec_add(in[48], in[48], in[49]); + PQCLEAN_MCELIECE348864F_VEC_vec_copy(pre[5], in[48]); + vec_add(in[16], in[16], in[48]); + vec_add(pre[0], pre[0], in[16]); + vec_add(in[17], in[17], in[16]); + vec_add(pre[1], pre[1], in[17]); + vec_add(in[19], in[19], in[17]); + vec_add(pre[0], pre[0], in[19]); + vec_add(in[18], in[18], in[19]); + vec_add(pre[2], pre[2], in[18]); + vec_add(in[22], in[22], in[18]); + vec_add(pre[0], pre[0], in[22]); + vec_add(in[23], in[23], in[22]); + vec_add(pre[1], pre[1], in[23]); + vec_add(in[21], in[21], in[23]); + vec_add(pre[0], pre[0], in[21]); + vec_add(in[20], 
in[20], in[21]); + vec_add(pre[3], pre[3], in[20]); + vec_add(in[28], in[28], in[20]); + vec_add(pre[0], pre[0], in[28]); + vec_add(in[29], in[29], in[28]); + vec_add(pre[1], pre[1], in[29]); + vec_add(in[31], in[31], in[29]); + vec_add(pre[0], pre[0], in[31]); + vec_add(in[30], in[30], in[31]); + vec_add(pre[2], pre[2], in[30]); + vec_add(in[26], in[26], in[30]); + vec_add(pre[0], pre[0], in[26]); + vec_add(in[27], in[27], in[26]); + vec_add(pre[1], pre[1], in[27]); + vec_add(in[25], in[25], in[27]); + vec_add(pre[0], pre[0], in[25]); + vec_add(in[24], in[24], in[25]); + vec_add(pre[4], pre[4], in[24]); + vec_add(in[8], in[8], in[24]); + vec_add(pre[0], pre[0], in[8]); + vec_add(in[9], in[9], in[8]); + vec_add(pre[1], pre[1], in[9]); + vec_add(in[11], in[11], in[9]); + vec_add(pre[0], pre[0], in[11]); + vec_add(in[10], in[10], in[11]); + vec_add(pre[2], pre[2], in[10]); + vec_add(in[14], in[14], in[10]); + vec_add(pre[0], pre[0], in[14]); + vec_add(in[15], in[15], in[14]); + vec_add(pre[1], pre[1], in[15]); + vec_add(in[13], in[13], in[15]); + vec_add(pre[0], pre[0], in[13]); + vec_add(in[12], in[12], in[13]); + vec_add(pre[3], pre[3], in[12]); + vec_add(in[4], in[4], in[12]); + vec_add(pre[0], pre[0], in[4]); + vec_add(in[5], in[5], in[4]); + vec_add(pre[1], pre[1], in[5]); + vec_add(in[7], in[7], in[5]); + vec_add(pre[0], pre[0], in[7]); + vec_add(in[6], in[6], in[7]); + vec_add(pre[2], pre[2], in[6]); + vec_add(in[2], in[2], in[6]); + vec_add(pre[0], pre[0], in[2]); + vec_add(in[3], in[3], in[2]); + vec_add(pre[1], pre[1], in[3]); + vec_add(in[1], in[1], in[3]); + + vec_add(pre[0], pre[0], in[1]); + vec_add(out[0], in[0], in[1]); + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp, pre[i], tmp); + vec_add(out[1], out[1], tmp); + } +} + +void PQCLEAN_MCELIECE348864F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece348864f/vec/fft_tr.h b/crypto_kem/mceliece348864f/vec/fft_tr.h new file mode 100644 index 00000000..7135dd2c --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE348864F_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE348864F_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/gf.c b/crypto_kem/mceliece348864f/vec/gf.c new file mode 100644 index 00000000..024ff7f8 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864F_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864F_VEC_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864F_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + 
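    /* the masked shifts here reduce the (up to) 23-bit carry-less product
       modulo the field polynomial x^12 + x^3 + 1 of GF(2^12): every product
       bit at position 12 + k is folded back onto positions 3 + k (shift by 9)
       and k (shift by 12), first for bits 14..22, then for bits 12..13 */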
tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864F_VEC_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864F_VEC_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864F_VEC_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_VEC_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_VEC_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_VEC_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864F_VEC_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864F_VEC_gf_mul(PQCLEAN_MCELIECE348864F_VEC_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/crypto_kem/mceliece348864f/vec/gf.h b/crypto_kem/mceliece348864f/vec/gf.h new file mode 100644 index 00000000..c65ad778 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_GF_H +#define PQCLEAN_MCELIECE348864F_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864F_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864F_VEC_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864F_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864F_VEC_GF_mul(gf * /*out*/, 
const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_VEC_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/operations.c b/crypto_kem/mceliece348864f/vec/operations.c new file mode 100644 index 00000000..a54005d8 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864F_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864F_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864F_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864F_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864F_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864F_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864F_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864F_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864F_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864F_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/vec/params.h b/crypto_kem/mceliece348864f/vec/params.h new file mode 100644 index 00000000..8f89f089 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/params.h @@ -0,0 +1,21 @@ +#ifndef 
PQCLEAN_MCELIECE348864F_VEC_PARAMS_H +#define PQCLEAN_MCELIECE348864F_VEC_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/pk_gen.c b/crypto_kem/mceliece348864f/vec/pk_gen.c new file mode 100644 index 00000000..b47c27a5 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/pk_gen.c @@ -0,0 +1,317 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = ((int)in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 63) / 64) ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
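    // (operating on this trailing 32x64 block is what the "f" parameter sets
    // add over the plain ones: its 32 pivot columns may be moved into place,
    // so key generation restarts far less often than when a fully systematic
    // matrix is required)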
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +int PQCLEAN_MCELIECE348864F_VEC_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + uint64_t irr_int[ GFBITS ]; + + vec consts[64][ GFBITS ]; + vec eval[ 64 ][ GFBITS ]; + vec prod[ 64 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864F_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864F_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE348864F_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + 
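    // ops is turned into the identity matrix just below; the elimination
    // further down selects row additions with arithmetic masks rather than
    // with branches on secret matrix bits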
for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H - 1; j++) { + PQCLEAN_MCELIECE348864F_VEC_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864F_VEC_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/vec/pk_gen.h b/crypto_kem/mceliece348864f/vec/pk_gen.h new file mode 100644 index 00000000..5666efea --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE348864F_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE348864F_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/powers.inc b/crypto_kem/mceliece348864f/vec/powers.inc new file mode 100644 index 00000000..a37fb2bd --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/powers.inc @@ -0,0 +1,896 @@ +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 
0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 
0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 
0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 
0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0x3333CCCC3333CCCC, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 
0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0xF0F0F0F00F0F0F0F, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0xF0F0F0F0F0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0x0000000000000000, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0xFFFFFFFFFFFFFFFF, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0xAA55AA55AA55AA55, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +}, +{ + 0x0F0F0F0FF0F0F0F0, + 0x0000FFFF0000FFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0F0F0F0F0F0F0F0F, + 0xFFFFFFFFFFFFFFFF, + 0x55AA55AA55AA55AA, + 0x0F0F0F0FF0F0F0F0, + 0x0000000000000000, + 0x00FF00FF00FF00FF, + 0xF0F0F0F0F0F0F0F0, + 0xCCCC3333CCCC3333, + 0x5555555555555555 +} diff --git a/crypto_kem/mceliece348864f/vec/scalars.inc b/crypto_kem/mceliece348864f/vec/scalars.inc new file mode 100644 index 00000000..aa8f64b9 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/crypto_kem/mceliece348864f/vec/scalars_2x.inc b/crypto_kem/mceliece348864f/vec/scalars_2x.inc new file mode 100644 index 00000000..e7c7fee5 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/scalars_2x.inc @@ -0,0 +1,140 @@ +{{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000C03C0C3C0330C, + 
0XF330CFFCC00F33C0, + 0XCCF330F00F3C0333, + 0XFF03FFF3FF0CF0C0, + 0X3CC3FCF00FCC303C, + 0X0F000C0FC30303F3, + 0XCF0FC3FF333CCF3C, + 0X003F3FC3C0FF333F, + 0X3CC3F0F3CF0FF00F, + 0XF3F33CC03FC30CC0, + 0X3CC330CFC333F33F, + 0X3CC0303FF3C3FFFC, +}}, +{{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0F00F00F00000000, + 0XF00000000000F000, + 0X00000F00000000F0, + 0X0F00F00000F00000, + 0X000F00000F00F00F, + 0X00F00F00F00F0000, + 0X0F00F00000000000, + 0X000000000F000000, + 0X00F00000000F00F0, + 0X0000F00F00000F00, + 0XF00000F00000F00F, + 0X00000F00F00F00F0, +}}, +{{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0XFF00FFFFFF000000, + 0XFF0000FFFF000000, + 0XFFFF00FFFF000000, + 0XFF00FFFFFFFFFF00, + 0X00000000FF00FF00, + 0XFFFFFFFF00FF0000, + 0X00FFFFFF00FF0000, + 0XFFFF00FFFF00FFFF, + 0XFFFF0000FFFFFFFF, + 0XFF00000000FF0000, + 0X000000FF00FF00FF, + 0X00FF00FF00FFFF00, +}}, +{{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X0000000000000000, + 0XFFFF000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFF00000000FFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF00000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF00000000FFFF, + 0X00000000FFFF0000, +}}, +{{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000, +}} diff --git a/crypto_kem/mceliece348864f/vec/sk_gen.c b/crypto_kem/mceliece348864f/vec/sk_gen.c new file mode 100644 index 00000000..ff6a6555 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864F_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864F_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864F_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + 
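        // mask is all-ones (0xFFF) exactly when the pivot mat[j][j] is zero,
        // so the loop above adds later columns into column j to repair the
        // pivot without branching on secret data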
+ if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864F_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864F_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864F_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864F_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864F_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece348864f/vec/sk_gen.h b/crypto_kem/mceliece348864f/vec/sk_gen.h new file mode 100644 index 00000000..ae90c33e --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE348864F_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864F_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864F_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/transpose.c b/crypto_kem/mceliece348864f/vec/transpose.c new file mode 100644 index 00000000..a28df8f1 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece348864f/vec/transpose.h b/crypto_kem/mceliece348864f/vec/transpose.h new file mode 100644 index 00000000..ca07570d --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE348864F_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE348864F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece348864f/vec/util.c b/crypto_kem/mceliece348864f/vec/util.c new file mode 100644 index 00000000..7ba6e65e --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/util.c @@ -0,0 +1,94 @@ +#include "util.h" + + +void PQCLEAN_MCELIECE348864F_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} 
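/*
 * Illustrative sketch (not part of this patch): the helpers in this file
 * serialize 16-, 32- and 64-bit words with the least significant byte first,
 * so the key and ciphertext formats do not depend on the host's endianness.
 * The standalone program below mirrors the store8/load8 convention using
 * hypothetical names (demo_store8, demo_load8); it is a round-trip check for
 * illustration only, not library code.
 */
#include <stdint.h>
#include <stdio.h>

/* least significant byte first, matching the convention used in this file */
static void demo_store8(unsigned char *out, uint64_t in) {
    int j;

    for (j = 0; j < 8; j++) {
        out[j] = (unsigned char)(in >> (8 * j));
    }
}

static uint64_t demo_load8(const unsigned char *in) {
    uint64_t ret = 0;
    int j;

    for (j = 7; j >= 0; j--) {
        ret = (ret << 8) | in[j];
    }

    return ret;
}

int main(void) {
    unsigned char buf[8];

    demo_store8(buf, UINT64_C(0x0123456789ABCDEF));

    /* prints "0123456789abcdef ef": the round trip is the identity and the
       first stored byte is the least significant one */
    printf("%016llx %02x\n", (unsigned long long)demo_load8(buf), (unsigned)buf[0]);

    return 0;
}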
+ +void PQCLEAN_MCELIECE348864F_VEC_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864F_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864F_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864F_VEC_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864F_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864F_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864F_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864F_VEC_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} diff --git a/crypto_kem/mceliece348864f/vec/util.h b/crypto_kem/mceliece348864f/vec/util.h new file mode 100644 index 00000000..fec3935e --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/util.h @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_UTIL_H +#define PQCLEAN_MCELIECE348864F_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" + +#include + +void PQCLEAN_MCELIECE348864F_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE348864F_VEC_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864F_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864F_VEC_load4(const unsigned char *src); + + +void PQCLEAN_MCELIECE348864F_VEC_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864F_VEC_store8(unsigned char *out, uint64_t in); + + +uint64_t PQCLEAN_MCELIECE348864F_VEC_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864F_VEC_bitrev(gf a); + +#endif diff --git a/crypto_kem/mceliece348864f/vec/vec.c b/crypto_kem/mceliece348864f/vec/vec.c new file mode 100644 index 00000000..c3981dc9 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/vec.c @@ -0,0 +1,131 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE348864F_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE348864F_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE348864F_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int 
PQCLEAN_MCELIECE348864F_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE348864F_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE348864F_VEC_vec_sq(vec *out, const vec *in) { + int i; + uint64_t result[GFBITS]; + + // + + result[0] = in[0] ^ in[6]; + result[1] = in[11]; + result[2] = in[1] ^ in[7]; + result[3] = in[6]; + result[4] = in[2] ^ in[11] ^ in[8]; + result[5] = in[7]; + result[6] = in[3] ^ in[9]; + result[7] = in[8]; + result[8] = in[4] ^ in[10]; + result[9] = in[9]; + result[10] = in[5] ^ in[11]; + result[11] = in[10]; + + // + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE348864F_VEC_vec_inv(vec *out, const vec *in) { + uint64_t tmp_11[GFBITS]; + uint64_t tmp_1111[GFBITS]; + + PQCLEAN_MCELIECE348864F_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp_11, out, in); // 11 + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(tmp_1111, out, tmp_11); // 1111 + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(out, out, tmp_1111); // 11111111 + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(out, out, tmp_11); // 1111111111 + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE348864F_VEC_vec_mul(out, out, in); // 11111111111 + + PQCLEAN_MCELIECE348864F_VEC_vec_sq(out, out); // 111111111110 +} diff --git a/crypto_kem/mceliece348864f/vec/vec.h b/crypto_kem/mceliece348864f/vec/vec.h new file mode 100644 index 00000000..e76dddf1 --- /dev/null +++ b/crypto_kem/mceliece348864f/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE348864F_VEC_VEC_H +#define PQCLEAN_MCELIECE348864F_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE348864F_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE348864F_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE348864F_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE348864F_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE348864F_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE348864F_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE348864F_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE348864F_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896/META.yml b/crypto_kem/mceliece460896/META.yml new file mode 100644 index 00000000..79340500 --- /dev/null +++ b/crypto_kem/mceliece460896/META.yml @@ -0,0 +1,48 @@ +name: Classic McEliece 460896 +type: kem +claimed-nist-level: 3 +claimed-security: IND-CCA2 +length-public-key: 524160 
+length-secret-key: 13568 +length-ciphertext: 188 +length-shared-secret: 32 +nistkat-sha256: b0822a5d00d7fad26380044c77b33370a5fb38e7851263229f590cac323a46a7 +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt diff --git a/crypto_kem/mceliece460896/avx/LICENSE b/crypto_kem/mceliece460896/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece460896/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece460896/avx/Makefile b/crypto_kem/mceliece460896/avx/Makefile new file mode 100644 index 00000000..fc6849f2 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece460896_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece460896/avx/aes256ctr.c b/crypto_kem/mceliece460896/avx/aes256ctr.c new file mode 100644 index 00000000..11fec7df --- /dev/null +++ b/crypto_kem/mceliece460896/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void 
PQCLEAN_MCELIECE460896_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece460896/avx/aes256ctr.h b/crypto_kem/mceliece460896/avx/aes256ctr.h new file mode 100644 index 00000000..405a3bef --- /dev/null +++ b/crypto_kem/mceliece460896/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE460896_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896/avx/api.h b/crypto_kem/mceliece460896/avx/api.h new file mode 100644 index 00000000..9b7bff77 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_API_H +#define PQCLEAN_MCELIECE460896_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_ALGNAME "Classic McEliece 460896" +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/benes.c b/crypto_kem/mceliece460896/avx/benes.c new file mode 100644 index 00000000..64ef249a --- /dev/null +++ b/crypto_kem/mceliece460896/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], 
diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = 
PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(PQCLEAN_MCELIECE460896_AVX_load8(ptr), PQCLEAN_MCELIECE460896_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(PQCLEAN_MCELIECE460896_AVX_load8(ptr), PQCLEAN_MCELIECE460896_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of 
bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece460896/avx/benes.h b/crypto_kem/mceliece460896/avx/benes.h new file mode 100644 index 00000000..5bb2798d --- /dev/null +++ b/crypto_kem/mceliece460896/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_BENES_H +#define PQCLEAN_MCELIECE460896_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/bm.c b/crypto_kem/mceliece460896/avx/bm.c new file mode 100644 index 00000000..687be14c --- /dev/null +++ b/crypto_kem/mceliece460896/avx/bm.c @@ -0,0 +1,210 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE460896_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE460896_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 
x, y; + + x = PQCLEAN_MCELIECE460896_AVX_vec256_or(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896_AVX_vec256_or(PQCLEAN_MCELIECE460896_AVX_vec256_srl_4x(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = 
PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0, one << 62); + BC[0][1] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + + d = PQCLEAN_MCELIECE460896_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE460896_AVX_update_asm(BC, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(BC[i][1], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(BC[i][1], 1); + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/crypto_kem/mceliece460896/avx/bm.h b/crypto_kem/mceliece460896/avx/bm.h new file mode 100644 index 00000000..40261250 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_BM_H +#define PQCLEAN_MCELIECE460896_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/consts.S b/crypto_kem/mceliece460896/avx/consts.S new file mode 100644 index 00000000..de62f1b0 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE460896_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE460896_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 
+PQCLEAN_MCELIECE460896_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE460896_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece460896/avx/consts.inc b/crypto_kem/mceliece460896/avx/consts.inc new file mode 100644 index 00000000..217965d3 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 
0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 
0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 
0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 
0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 
0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 
0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 
0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece460896/avx/controlbits.c b/crypto_kem/mceliece460896/avx/controlbits.c new file mode 100644 index 00000000..f3cee38a --- /dev/null +++ b/crypto_kem/mceliece460896/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896_AVX_sort_63b(int n, uint64_t *x) { 
+ if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE460896_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896/avx/controlbits.h b/crypto_kem/mceliece460896/avx/controlbits.h new file mode 100644 index 00000000..eb40ca1d --- /dev/null +++ b/crypto_kem/mceliece460896/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/crypto_hash.h b/crypto_kem/mceliece460896/avx/crypto_hash.h new file mode 100644 index 00000000..439f460a --- /dev/null +++ b/crypto_kem/mceliece460896/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896/avx/decrypt.c b/crypto_kem/mceliece460896/avx/decrypt.c new file mode 100644 index 00000000..e1385f44 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE460896_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(err[i], 0); + v[1] = 
PQCLEAN_MCELIECE460896_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE460896_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE460896_AVX_vec256_or(diff, PQCLEAN_MCELIECE460896_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE460896_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE460896_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE460896_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE460896_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = 
PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE460896_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE460896_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE460896_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece460896/avx/decrypt.h b/crypto_kem/mceliece460896/avx/decrypt.h new file mode 100644 index 00000000..a84d461a --- /dev/null +++ b/crypto_kem/mceliece460896/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE460896_AVX_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE460896_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/encrypt.c b/crypto_kem/mceliece460896/avx/encrypt.c new file mode 100644 index 00000000..b7c6a960 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE460896_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE460896_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896_AVX_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE460896_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896/avx/encrypt.h b/crypto_kem/mceliece460896/avx/encrypt.h new file mode 100644 index 00000000..c223c3e9 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE460896_AVX_ENCRYPT_H +/* + This file is for Niederreiter 
encryption +*/ + + +void PQCLEAN_MCELIECE460896_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + + diff --git a/crypto_kem/mceliece460896/avx/fft.c b/crypto_kem/mceliece460896/avx/fft.c new file mode 100644 index 00000000..1deb9d6d --- /dev/null +++ b/crypto_kem/mceliece460896/avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // broadcast + + for (j = 
0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = 
PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE460896_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + 
PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896/avx/fft.h b/crypto_kem/mceliece460896/avx/fft.h new file mode 100644 index 00000000..dc68724e --- /dev/null +++ b/crypto_kem/mceliece460896/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_FFT_H +#define PQCLEAN_MCELIECE460896_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE460896_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/fft_tr.c b/crypto_kem/mceliece460896/avx/fft_tr.c new file mode 100644 index 00000000..fe24bb9a --- /dev/null +++ b/crypto_kem/mceliece460896/avx/fft_tr.c @@ -0,0 +1,398 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896_AVX_vec256_and(in[i], mask[k][1]); 
+ t = PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } 
+ + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE460896_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[61], 
buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = 
PQCLEAN_MCELIECE460896_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 3); + + v[3] = 0; + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE460896_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896/avx/fft_tr.h b/crypto_kem/mceliece460896/avx/fft_tr.h new file mode 100644 index 00000000..2dd47439 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE460896_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/gf.c b/crypto_kem/mceliece460896/avx/gf.c new file mode 100644 index 00000000..903fbab4 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; 
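+    /* the masks in M[] select the bits above position 12; each selected chunk
+       is folded back in below via z^13 = z^4 + z^3 + z + 1, hence the shifts
+       by 9, 10, 12, and 13 */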
+ + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896/avx/gf.h b/crypto_kem/mceliece460896/avx/gf.h new file mode 100644 index 00000000..97f85d4f --- /dev/null +++ b/crypto_kem/mceliece460896/avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_GF_H +#define PQCLEAN_MCELIECE460896_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/int32_sort.c b/crypto_kem/mceliece460896/avx/int32_sort.c new file mode 100644 index 00000000..6b71b048 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + 
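+/* the routines below sort eight int32 lanes per 256-bit vector using AVX2
+   intrinsics (_mm256_*) */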
+#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, 
x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + 
x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, 
x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + 
int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 
0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = 
int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE460896_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + 
int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE460896_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = 
_mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = 
_mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece460896/avx/int32_sort.h b/crypto_kem/mceliece460896/avx/int32_sort.h new file mode 100644 index 00000000..91832cd6 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE460896_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE460896_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece460896/avx/operations.c b/crypto_kem/mceliece460896/avx/operations.c new file mode 100644 index 00000000..bb6da6d6 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include 
"controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/avx/params.h b/crypto_kem/mceliece460896/avx/params.h new file mode 100644 index 00000000..33cb311b --- /dev/null +++ b/crypto_kem/mceliece460896/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_PARAMS_H +#define PQCLEAN_MCELIECE460896_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + 
+#endif + diff --git a/crypto_kem/mceliece460896/avx/pk_gen.c b/crypto_kem/mceliece460896/avx/pk_gen.c new file mode 100644 index 00000000..80bb5df0 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/pk_gen.c @@ -0,0 +1,290 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE460896_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE460896_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + 
PQCLEAN_MCELIECE460896_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ 
k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE460896_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE460896_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896/avx/pk_gen.h b/crypto_kem/mceliece460896/avx/pk_gen.h new file mode 100644 index 00000000..a43211f1 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE460896_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE460896_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/scalars_2x.inc b/crypto_kem/mceliece460896/avx/scalars_2x.inc new file mode 100644 index 00000000..78f194ea --- /dev/null +++ b/crypto_kem/mceliece460896/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + 
PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git 
a/crypto_kem/mceliece460896/avx/scalars_4x.inc b/crypto_kem/mceliece460896/avx/scalars_4x.inc new file mode 100644 index 00000000..47155d66 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 
0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece460896/avx/sk_gen.c b/crypto_kem/mceliece460896/avx/sk_gen.c new file mode 100644 index 00000000..3670a34d --- /dev/null +++ b/crypto_kem/mceliece460896/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] 
= f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/avx/sk_gen.h b/crypto_kem/mceliece460896/avx/sk_gen.h new file mode 100644 index 00000000..8bed1da4 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE460896_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/syndrome_asm.S b/crypto_kem/mceliece460896/avx/syndrome_asm.S new file mode 100644 index 00000000..88ceaadc --- /dev/null +++ b/crypto_kem/mceliece460896/avx/syndrome_asm.S @@ -0,0 +1,650 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_syndrome_asm +.global PQCLEAN_MCELIECE460896_AVX_syndrome_asm +_PQCLEAN_MCELIECE460896_AVX_syndrome_asm: +PQCLEAN_MCELIECE460896_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 156 ] +# asm 1: vmovupd 
156(ee=reg256#2 +# asm 2: vmovupd 156(ee=%ymm1 +vmovupd 156(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 188 ] +# asm 1: vmovupd 188(ee=reg256#3 +# asm 2: vmovupd 188(ee=%ymm2 +vmovupd 188(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 220 ] +# asm 1: vmovupd 220(ee=reg256#3 +# asm 2: vmovupd 220(ee=%ymm2 +vmovupd 220(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 252 ] +# asm 1: vmovupd 252(ee=reg256#3 +# asm 2: vmovupd 252(ee=%ymm2 +vmovupd 252(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 284 ] +# asm 1: vmovupd 284(ee=reg256#3 +# asm 2: vmovupd 284(ee=%ymm2 +vmovupd 284(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 316 ] +# asm 1: vmovupd 316(ee=reg256#3 +# asm 2: vmovupd 316(ee=%ymm2 +vmovupd 316(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 348 ] +# asm 1: vmovupd 348(ee=reg256#3 +# asm 2: vmovupd 348(ee=%ymm2 +vmovupd 348(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 380 ] +# asm 1: vmovupd 380(ee=reg256#3 +# asm 2: vmovupd 380(ee=%ymm2 +vmovupd 380(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 412 ] +# asm 1: vmovupd 412(ee=reg256#3 +# asm 2: vmovupd 412(ee=%ymm2 +vmovupd 412(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 444 ] +# asm 1: vmovupd 444(ee=reg256#3 +# asm 2: vmovupd 444(ee=%ymm2 +vmovupd 444(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 476 ] +# asm 1: vmovupd 476(ee=reg256#3 +# asm 2: vmovupd 476(ee=%ymm2 +vmovupd 476(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 508 ] +# asm 1: vmovupd 508(ee=reg256#3 +# asm 2: vmovupd 508(ee=%ymm2 +vmovupd 508(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 540 ] +# asm 1: vmovupd 540(ee=reg256#3 +# asm 2: vmovupd 540(ee=%ymm2 +vmovupd 540(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# 
qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ 
PQCLEAN_MCELIECE460896_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: 
vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 
+vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ 
input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: 
v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: 
vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 
2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand 
v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 
176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 
= x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 
1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# 
asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# 
asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw 
$8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: 
v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 
+# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# 
asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 
2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 
+vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# 
qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 
1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa 
PQCLEAN_MCELIECE460896_AVX_MASK2_1(%rip),>mask1=%xmm1
+movdqa PQCLEAN_MCELIECE460896_AVX_MASK2_1(%rip),%xmm1
+
+# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK1_0 ]
+# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_0(%rip),>mask2=reg128#3
+# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_0(%rip),>mask2=%xmm2
+movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_0(%rip),%xmm2
+
+# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK1_1 ]
+# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_1(%rip),>mask3=reg128#4
+# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_1(%rip),>mask3=%xmm3
+movdqa PQCLEAN_MCELIECE460896_AVX_MASK1_1(%rip),%xmm3
+
+# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK0_0 ]
+# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_0(%rip),>mask4=reg128#5
+# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_0(%rip),>mask4=%xmm4
+movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_0(%rip),%xmm4
+
+# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK0_1 ]
+# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_1(%rip),>mask5=reg128#6
+# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_1(%rip),>mask5=%xmm5
+movdqa PQCLEAN_MCELIECE460896_AVX_MASK0_1(%rip),%xmm5
+
+# qhasm: x0 = mem128[ input_0 + 0 ]
+# asm 1: movdqu 0(x0=reg128#7
+# asm 2: movdqu 0(x0=%xmm6
+movdqu 0(%rdi),%xmm6
+
+# qhasm: x1 = mem128[ input_0 + 16 ]
+# asm 1: movdqu 16(x1=reg128#8
+# asm 2: movdqu 16(x1=%xmm7
+movdqu 16(%rdi),%xmm7
+
+# qhasm: x2 = mem128[ input_0 + 32 ]
+# asm 1: movdqu 32(x2=reg128#9
+# asm 2: movdqu 32(x2=%xmm8
+movdqu 32(%rdi),%xmm8
+
+# qhasm: x3 = mem128[ input_0 + 48 ]
+# asm 1: movdqu 48(x3=reg128#10
+# asm 2: movdqu 48(x3=%xmm9
+movdqu 48(%rdi),%xmm9
+
+# qhasm: x4 = mem128[ input_0 + 64 ]
+# asm 1: movdqu 64(x4=reg128#11
+# asm 2: movdqu 64(x4=%xmm10
+movdqu 64(%rdi),%xmm10
+
+# qhasm: x5 = mem128[ input_0 + 80 ]
+# asm 1: movdqu 80(x5=reg128#12
+# asm 2: movdqu 80(x5=%xmm11
+movdqu 80(%rdi),%xmm11
+
+# qhasm: x6 = mem128[ input_0 + 96 ]
+# asm 1: movdqu 96(x6=reg128#13
+# asm 2: movdqu 96(x6=%xmm12
+movdqu 96(%rdi),%xmm12
+
+# qhasm: x7 = mem128[ input_0 + 112 ]
+# asm 1: movdqu 112(x7=reg128#14
+# asm 2: movdqu 112(x7=%xmm13
+movdqu 112(%rdi),%xmm13
+
+# qhasm: v00 = x0 & mask0
+# asm 1: vpand v00=reg128#15
+# asm 2: vpand v00=%xmm14
+vpand %xmm0,%xmm6,%xmm14
+
+# qhasm: v10 = x4 & mask0
+# asm 1: vpand v10=reg128#16
+# asm 2: vpand v10=%xmm15
+vpand %xmm0,%xmm10,%xmm15
+
+# qhasm: 2x v10 <<= 4
+# asm 1: psllq $4,v01=reg128#7
+# asm 2: vpand v01=%xmm6
+vpand %xmm1,%xmm6,%xmm6
+
+# qhasm: v11 = x4 & mask1
+# asm 1: vpand v11=reg128#11
+# asm 2: vpand v11=%xmm10
+vpand %xmm1,%xmm10,%xmm10
+
+# qhasm: 2x v01 unsigned>>= 4
+# asm 1: psrlq $4,x0=reg128#15
+# asm 2: vpor x0=%xmm14
+vpor %xmm15,%xmm14,%xmm14
+
+# qhasm: x4 = v01 | v11
+# asm 1: vpor x4=reg128#7
+# asm 2: vpor x4=%xmm6
+vpor %xmm10,%xmm6,%xmm6
+
+# qhasm: v00 = x1 & mask0
+# asm 1: vpand v00=reg128#11
+# asm 2: vpand v00=%xmm10
+vpand %xmm0,%xmm7,%xmm10
+
+# qhasm: v10 = x5 & mask0
+# asm 1: vpand v10=reg128#16
+# asm 2: vpand v10=%xmm15
+vpand %xmm0,%xmm11,%xmm15
+
+# qhasm: 2x v10 <<= 4
+# asm 1: psllq $4,v01=reg128#8
+# asm 2: vpand v01=%xmm7
+vpand %xmm1,%xmm7,%xmm7
+
+# qhasm: v11 = x5 & mask1
+# asm 1: vpand v11=reg128#12
+# asm 2: vpand v11=%xmm11
+vpand %xmm1,%xmm11,%xmm11
+
+# qhasm: 2x v01 unsigned>>= 4
+# asm 1: psrlq $4,x1=reg128#11
+# asm 2: vpor x1=%xmm10
+vpor %xmm15,%xmm10,%xmm10
+
+# qhasm: x5 = v01 | v11
+# asm 1: vpor x5=reg128#8
+# asm 2: vpor x5=%xmm7
+vpor %xmm11,%xmm7,%xmm7
+
+# qhasm: v00 = x2 
& mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# 
qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# 
asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 
+vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor 
x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand 
v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | 
v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ 
input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# 
qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 
+ +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand 
v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq 
$2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = 
x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: 
x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor 
x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand 
v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_0,>mask4=%ymm4 +vmovapd 
PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# 
qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 
16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# 
qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# 
qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = 
x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 
= x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = 
x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 
+ 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 
+# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw 
$8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 
| v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & 
mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 
+# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# 
asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand 
v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 
+# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 
+# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand 
v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 
= v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK2_0,>mask0=%ymm0 +vmovapd 
PQCLEAN_MCELIECE460896_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# 
asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 
= x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 
+vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand 
v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# 
asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand 
v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq 
$2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: 
vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand 
%ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: 
vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 
+vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq 
$2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand 
v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 
+# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand 
v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 
1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & 
mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# 
qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 
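+# Note (descriptive comment, not generated by qhasm; a hedged reading of the
+# surrounding generated code): each repeated block here is one butterfly step of
+# the bit-matrix transpose.  For a mask pair (maskLo, maskHi) and a shift s in
+# {4, 2, 1}, two 256-bit rows x and y are recombined per 64-bit lane as
+#     x' = (x & maskLo) | ((y & maskLo) << s)
+#     y' = ((x & maskHi) >> s) | (y & maskHi)
+# using vpand, vpsllq, vpsrlq and vpor, before the rows are stored back with
+# vmovupd.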
+# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: 
vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE460896_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece460896/avx/update_asm.S b/crypto_kem/mceliece460896/avx/update_asm.S new file mode 100644 index 00000000..33046f9d --- /dev/null +++ b/crypto_kem/mceliece460896/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 
input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_update_asm +.global PQCLEAN_MCELIECE460896_AVX_update_asm +_PQCLEAN_MCELIECE460896_AVX_update_asm: +PQCLEAN_MCELIECE460896_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr 
$1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE460896_AVX_vec128_set2x( PQCLEAN_MCELIECE460896_AVX_load8(in), PQCLEAN_MCELIECE460896_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896_AVX_store8(out + 0, PQCLEAN_MCELIECE460896_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896_AVX_store8(out + 8, PQCLEAN_MCELIECE460896_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece460896/avx/util.h b/crypto_kem/mceliece460896/avx/util.h new file mode 100644 index 00000000..6a01bd94 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_UTIL_H +#define PQCLEAN_MCELIECE460896_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896_AVX_irr_load(vec128 *out, const unsigned char *in); +void 
PQCLEAN_MCELIECE460896_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece460896/avx/vec128.c b/crypto_kem/mceliece460896/avx/vec128.c new file mode 100644 index 00000000..1765598f --- /dev/null +++ b/crypto_kem/mceliece460896/avx/vec128.c @@ -0,0 +1,83 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE460896_AVX_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece460896/avx/vec128.h b/crypto_kem/mceliece460896/avx/vec128.h new file mode 100644 index 00000000..cb6fa2b2 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_VEC128_H +#define PQCLEAN_MCELIECE460896_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
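+// Illustrative usage note (added comment, not taken from the upstream sources):
+// the lane index must be a compile-time constant 0 or 1, e.g.
+//     uint64_t lo = PQCLEAN_MCELIECE460896_AVX_vec128_extract(v, 0);
+//     uint64_t hi = PQCLEAN_MCELIECE460896_AVX_vec128_extract(v, 1);
+// A runtime-variable index would not compile, because _mm_extract_epi64 encodes
+// the lane as an immediate operand of the underlying pextrq instruction.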
+#define PQCLEAN_MCELIECE460896_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece460896/avx/vec128_mul_asm.S b/crypto_kem/mceliece460896/avx/vec128_mul_asm.S new file mode 100644 index 00000000..7387f125 --- /dev/null +++ b/crypto_kem/mceliece460896/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: 
reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor 
r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: 
vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# 
asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = 
h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 
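+# Note (descriptive comment, not generated by qhasm; a hedged reading of the
+# surrounding generated code): this tail folds the 25 partial coefficients
+# h0..h24 of the bitsliced GF(2)[x] product down to 13 outputs.  Each high
+# coefficient h(13+k) is XORed into h(k), h(k+1), h(k+3) and h(k+4), which
+# corresponds to reduction modulo the GF(2^13) field polynomial
+# x^13 + x^4 + x^3 + x + 1; the resulting h0..h12 are then combined with the
+# buffered partial sums and stored to input_0.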
+movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ 
input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE460896_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE460896_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE460896_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE460896_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE460896_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece460896/avx/vec256_ama_asm.S b/crypto_kem/mceliece460896/avx/vec256_ama_asm.S new file mode 100644 index 00000000..6a4253fa --- /dev/null +++ b/crypto_kem/mceliece460896/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: 
reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# 
asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand 
r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# 
asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: 
mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 
96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 
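+
+# note (inferred from the surrounding qhasm annotations): in this first pass
+# each 256-bit limb a_i is formed as mem256[input_0 + 32*i] ^ mem256[input_1 + 32*i],
+# written back to input_0, and then ANDed against the thirteen limbs at
+# input_2, with each partial product XORed into the accumulator r_{i+j}.
+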
+vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# 
qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & 
b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 
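+
+# note (inferred from the surrounding qhasm annotations): this pass repeats the
+# same schoolbook AND/XOR pattern with the a_i limbs read from input_1; as each
+# high accumulator limb r_k (k >= 13) is completed it is folded into lower
+# limbs (e.g. the r15 ^= r24 step above), so the result is reduced back to
+# 13 limbs, consistent with reduction modulo the GF(2^13) field polynomial.
+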
+vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 
224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: 
vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 
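+
+# note (inferred from the surrounding qhasm annotations): in this writeback
+# stretch each accumulated limb r_k is first XORed with the corresponding limb
+# at input_0 and stored there, then further XORed with the limb at input_1 and
+# stored there as well, so the product is folded into both operand arrays
+# rather than written out separately.
+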
+# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 
+vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 
128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# 
qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 
384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt 
%rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896/clean/api.h b/crypto_kem/mceliece460896/clean/api.h new file mode 100644 index 00000000..49fe2eb1 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_API_H +#define PQCLEAN_MCELIECE460896_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_ALGNAME "Classic McEliece 460896" +#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/benes.c b/crypto_kem/mceliece460896/clean/benes.c new file mode 100644 index 00000000..b8bada2d --- /dev/null +++ 
b/crypto_kem/mceliece460896/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE460896_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE460896_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE460896_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE460896_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: 
condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE460896_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE460896_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE460896_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece460896/clean/benes.h b/crypto_kem/mceliece460896/clean/benes.h new file mode 100644 index 00000000..dcc461de --- /dev/null +++ b/crypto_kem/mceliece460896/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_BENES_H +#define PQCLEAN_MCELIECE460896_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE460896_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/bm.c b/crypto_kem/mceliece460896/clean/bm.c new file mode 100644 index 00000000..2f6a4afa --- /dev/null +++ b/crypto_kem/mceliece460896/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE460896_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE460896_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece460896/clean/bm.h b/crypto_kem/mceliece460896/clean/bm.h new file mode 100644 index 00000000..60c256d7 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_BM_H +#define PQCLEAN_MCELIECE460896_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/controlbits.c b/crypto_kem/mceliece460896/clean/controlbits.c new file mode 
100644 index 00000000..04c5d25f --- /dev/null +++ b/crypto_kem/mceliece460896/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896/clean/controlbits.h b/crypto_kem/mceliece460896/clean/controlbits.h new file mode 100644 index 00000000..71da68e3 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation
+  see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
+*/
+
+
+#include <stdint.h>
+
+void PQCLEAN_MCELIECE460896_CLEAN_sort_63b(int n, uint64_t *x);
+void PQCLEAN_MCELIECE460896_CLEAN_controlbits(unsigned char *out, const uint32_t *pi);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896/clean/crypto_hash.h b/crypto_kem/mceliece460896/clean/crypto_hash.h
new file mode 100644
index 00000000..5310f602
--- /dev/null
+++ b/crypto_kem/mceliece460896/clean/crypto_hash.h
@@ -0,0 +1,7 @@
+#ifndef PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_HASH_H
+#define PQCLEAN_MCELIECE460896_CLEAN_CRYPTO_HASH_H
+#include "fips202.h"
+
+#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen)
+
+#endif
diff --git a/crypto_kem/mceliece460896/clean/decrypt.c b/crypto_kem/mceliece460896/clean/decrypt.c
new file mode 100644
index 00000000..c9ee5d94
--- /dev/null
+++ b/crypto_kem/mceliece460896/clean/decrypt.c
@@ -0,0 +1,90 @@
+/*
+  This file is for Niederreiter decryption
+*/
+
+#include "decrypt.h"
+
+#include "benes.h"
+#include "bm.h"
+#include "gf.h"
+#include "params.h"
+#include "root.h"
+#include "synd.h"
+#include "util.h"
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* input: sk, secret key */
+/*        c, ciphertext */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
+int PQCLEAN_MCELIECE460896_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
+    int i, w = 0;
+    uint16_t check;
+
+    unsigned char r[ SYS_N / 8 ];
+
+    gf g[ SYS_T + 1 ];
+    gf L[ SYS_N ];
+
+    gf s[ SYS_T * 2 ];
+    gf s_cmp[ SYS_T * 2 ];
+    gf locator[ SYS_T + 1 ];
+    gf images[ SYS_N ];
+
+    gf t;
+
+    //
+
+    for (i = 0; i < SYND_BYTES; i++) {
+        r[i] = c[i];
+    }
+    for (i = SYND_BYTES; i < SYS_N / 8; i++) {
+        r[i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        g[i] = PQCLEAN_MCELIECE460896_CLEAN_load2(sk);
+        g[i] &= GFMASK;
+        sk += 2;
+    }
+    g[ SYS_T ] = 1;
+
+    PQCLEAN_MCELIECE460896_CLEAN_support_gen(L, sk);
+
+    PQCLEAN_MCELIECE460896_CLEAN_synd(s, g, L, r);
+
+    PQCLEAN_MCELIECE460896_CLEAN_bm(locator, s);
+
+    PQCLEAN_MCELIECE460896_CLEAN_root(images, locator, L);
+
+    //
+
+    for (i = 0; i < SYS_N / 8; i++) {
+        e[i] = 0;
+    }
+
+    for (i = 0; i < SYS_N; i++) {
+        t = PQCLEAN_MCELIECE460896_CLEAN_gf_iszero(images[i]) & 1;
+
+        e[ i / 8 ] |= t << (i % 8);
+        w += t;
+
+    }
+
+    PQCLEAN_MCELIECE460896_CLEAN_synd(s_cmp, g, L, e);
+
+    //
+
+    check = (uint16_t)w;
+    check ^= SYS_T;
+
+    for (i = 0; i < SYS_T * 2; i++) {
+        check |= s[i] ^ s_cmp[i];
+    }
+
+    check -= 1;
+    check >>= 15;
+
+    return check ^ 1;
+}
+
diff --git a/crypto_kem/mceliece460896/clean/decrypt.h b/crypto_kem/mceliece460896/clean/decrypt.h
new file mode 100644
index 00000000..84c75271
--- /dev/null
+++ b/crypto_kem/mceliece460896/clean/decrypt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_MCELIECE460896_CLEAN_DECRYPT_H
+#define PQCLEAN_MCELIECE460896_CLEAN_DECRYPT_H
+/*
+  This file is for Niederreiter decryption
+*/
+
+int PQCLEAN_MCELIECE460896_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896/clean/encrypt.c b/crypto_kem/mceliece460896/clean/encrypt.c
new file mode 100644
index 00000000..7212bcd8
--- /dev/null
+++ b/crypto_kem/mceliece460896/clean/encrypt.c
@@ -0,0 +1,138 @@
+/*
+  This file is for Niederreiter encryption
+*/
+
+#include "encrypt.h"
+
+#include "params.h"
+#include "randombytes.h"
+#include "util.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "gf.h"
+
+static inline uint8_t
same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE460896_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896/clean/encrypt.h b/crypto_kem/mceliece460896/clean/encrypt.h new file mode 100644 index 00000000..28efbe46 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE460896_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/gf.c b/crypto_kem/mceliece460896/clean/gf.c new file mode 100644 index 00000000..7baa642c --- /dev/null +++ b/crypto_kem/mceliece460896/clean/gf.c @@ -0,0 +1,211 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE460896_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE460896_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE460896_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 
13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE460896_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE460896_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE460896_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 11] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(prod[i], (gf) 714); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(prod[i], (gf) 5296); + prod[i - SYS_T + 4] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(prod[i], (gf) 728); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(prod[i], 
(gf) 5881); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896/clean/gf.h b/crypto_kem/mceliece460896/clean/gf.h new file mode 100644 index 00000000..90287309 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_GF_H +#define PQCLEAN_MCELIECE460896_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE460896_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE460896_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/operations.c b/crypto_kem/mceliece460896/clean/operations.c new file mode 100644 index 00000000..65d61de6 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i 
< (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/clean/params.h b/crypto_kem/mceliece460896/clean/params.h new file mode 100644 index 00000000..024407d5 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE460896_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896/clean/pk_gen.c b/crypto_kem/mceliece460896/clean/pk_gen.c new file mode 100644 index 00000000..3a4b8be9 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/pk_gen.c @@ -0,0 +1,144 @@ +/* + This file is for public-key generation +*/ + +#include + +#include "benes.h" +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE460896_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + uint8_t mask; + uint8_t b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE460896_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE460896_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE460896_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE460896_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE460896_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE460896_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 1; 
k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/clean/pk_gen.h b/crypto_kem/mceliece460896/clean/pk_gen.h new file mode 100644 index 00000000..f2fe6df6 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE460896_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE460896_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/root.c b/crypto_kem/mceliece460896/clean/root.c new file mode 100644 index 00000000..c4259244 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE460896_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE460896_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE460896_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE460896_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE460896_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece460896/clean/root.h b/crypto_kem/mceliece460896/clean/root.h new file mode 100644 index 00000000..ae4c4aec --- /dev/null +++ b/crypto_kem/mceliece460896/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE460896_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE460896_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE460896_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/sk_gen.c b/crypto_kem/mceliece460896/clean/sk_gen.c new file mode 100644 index 00000000..cbdc3c94 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j 
= 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/clean/sk_gen.h b/crypto_kem/mceliece460896/clean/sk_gen.h new file mode 100644 index 00000000..e4cd90cf --- /dev/null +++ b/crypto_kem/mceliece460896/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE460896_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE460896_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/synd.c b/crypto_kem/mceliece460896/clean/synd.c new file mode 100644 index 00000000..fc087567 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE460896_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE460896_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE460896_CLEAN_gf_inv(PQCLEAN_MCELIECE460896_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE460896_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE460896_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE460896_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece460896/clean/synd.h b/crypto_kem/mceliece460896/clean/synd.h new file mode 100644 index 00000000..21264b36 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_SYND_H +#define PQCLEAN_MCELIECE460896_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/transpose.c b/crypto_kem/mceliece460896/clean/transpose.c new file mode 100644 index 00000000..935c2841 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/transpose.c @@ -0,0 +1,42 @@ +/* +
This file is for matrix transposition +*/ + +#include "transpose.h" + +#include <stdint.h> + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece460896/clean/transpose.h b/crypto_kem/mceliece460896/clean/transpose.h new file mode 100644 index 00000000..908cd2d2 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE460896_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include <stdint.h> + +void PQCLEAN_MCELIECE460896_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896/clean/util.c b/crypto_kem/mceliece460896/clean/util.c new file mode 100644 index 00000000..ab333b5d --- /dev/null +++ b/crypto_kem/mceliece460896/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE460896_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE460896_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE460896_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece460896/clean/util.h b/crypto_kem/mceliece460896/clean/util.h new file mode 100644 index 00000000..fa66bb63 --- /dev/null +++ b/crypto_kem/mceliece460896/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE460896_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include <stdint.h> + +void PQCLEAN_MCELIECE460896_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t
PQCLEAN_MCELIECE460896_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE460896_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE460896_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE460896_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE460896_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/LICENSE b/crypto_kem/mceliece460896/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece460896/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece460896/sse/Makefile b/crypto_kem/mceliece460896/sse/Makefile new file mode 100644 index 00000000..12ebde83 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece460896_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece460896/sse/aes256ctr.c b/crypto_kem/mceliece460896/sse/aes256ctr.c new file mode 100644 index 00000000..46564503 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE460896_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece460896/sse/aes256ctr.h b/crypto_kem/mceliece460896/sse/aes256ctr.h new file mode 100644 index 00000000..5311dd5a --- /dev/null +++ b/crypto_kem/mceliece460896/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE460896_SSE_AES256CTR_H + +#include <stddef.h> +#include <stdint.h> + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git
a/crypto_kem/mceliece460896/sse/api.h b/crypto_kem/mceliece460896/sse/api.h new file mode 100644 index 00000000..1a6dab5c --- /dev/null +++ b/crypto_kem/mceliece460896/sse/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_API_H +#define PQCLEAN_MCELIECE460896_SSE_API_H + +#include <stdint.h> + +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_ALGNAME "Classic McEliece 460896" +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/benes.c b/crypto_kem/mceliece460896/sse/benes.c new file mode 100644 index 00000000..9530e70b --- /dev/null +++ b/crypto_kem/mceliece460896/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff =
PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for 
(x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(PQCLEAN_MCELIECE460896_SSE_load8(ptr), PQCLEAN_MCELIECE460896_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(PQCLEAN_MCELIECE460896_SSE_load8(ptr), PQCLEAN_MCELIECE460896_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + 
layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece460896/sse/benes.h b/crypto_kem/mceliece460896/sse/benes.h new file mode 100644 index 00000000..6d4fc8ce --- /dev/null +++ b/crypto_kem/mceliece460896/sse/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_BENES_H +#define PQCLEAN_MCELIECE460896_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE460896_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/bm.c b/crypto_kem/mceliece460896/sse/bm.c new file mode 100644 index 00000000..de3e393e --- /dev/null +++ b/crypto_kem/mceliece460896/sse/bm.c @@ -0,0 +1,204 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE460896_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE460896_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE460896_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE460896_SSE_vec128_or(PQCLEAN_MCELIECE460896_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(PQCLEAN_MCELIECE460896_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896_SSE_vec128_or(PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(PQCLEAN_MCELIECE460896_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896_SSE_vec128_setzero(); + } + + mask[0][0] = 
PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0, one << 63); + B[0] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0, one << 62); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE460896_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896_SSE_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896_SSE_update_asm(interval, coefs[N]); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(prod, C, (vec128 *) interval); + d = PQCLEAN_MCELIECE460896_SSE_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE460896_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE460896_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE460896_SSE_update_asm(B, 0); + + 
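+ // Constant-time Berlekamp-Massey step: mask is all-ones exactly when the discrepancy d is nonzero and 2*L <= N; vec128_cmov above copies the old C into B only in that case, update_asm then shifts B by one position, and the new C = d*B_old + b*C_old is assembled below from B_tmp and C_tmp; b and L are refreshed under the same mask, so no branch depends on secret data.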
for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(C[i], 0); + v[1] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(C[i], 1); + + out[i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/crypto_kem/mceliece460896/sse/bm.h b/crypto_kem/mceliece460896/sse/bm.h new file mode 100644 index 00000000..015824f5 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_BM_H +#define PQCLEAN_MCELIECE460896_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE460896_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/consts.S b/crypto_kem/mceliece460896/sse/consts.S new file mode 100644 index 00000000..3156dbee --- /dev/null +++ b/crypto_kem/mceliece460896/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE460896_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE460896_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE460896_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE460896_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE460896_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE460896_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE460896_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE460896_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE460896_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE460896_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece460896/sse/consts.inc b/crypto_kem/mceliece460896/sse/consts.inc new file mode 100644 index 00000000..6dc9b4d1 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece460896/sse/controlbits.c b/crypto_kem/mceliece460896/sse/controlbits.c new file mode 100644 index 00000000..ac7adfe7 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896/sse/controlbits.h b/crypto_kem/mceliece460896/sse/controlbits.h new file mode 100644 index 00000000..038a42b6 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/crypto_hash.h b/crypto_kem/mceliece460896/sse/crypto_hash.h new file mode 100644 index 00000000..df1c6228 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896/sse/decrypt.c b/crypto_kem/mceliece460896/sse/decrypt.c new file mode 100644 index 00000000..af96cb64 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/decrypt.c @@ -0,0 +1,204 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE460896_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE460896_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896_SSE_vec128_extract(error[i], 1) ); + } + + for (i 
= 0; i < SYS_N / 8; i++) {
+        w1 += _mm_popcnt_u64( e[i] );
+    }
+
+    check = (w0 ^ SYS_T) | (w1 ^ SYS_T);
+    check -= 1;
+    check >>= 15;
+
+    return check;
+}
+
+static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) {
+    int i, j;
+    vec128 diff;
+
+    diff = PQCLEAN_MCELIECE460896_SSE_vec128_or(PQCLEAN_MCELIECE460896_SSE_vec128_xor(s0[0][0], s1[0][0]),
+                                                PQCLEAN_MCELIECE460896_SSE_vec128_xor(s0[1][0], s1[1][0]));
+
+    for (i = 0; i < 2; i++) {
+        for (j = 1; j < GFBITS; j++) {
+            diff = PQCLEAN_MCELIECE460896_SSE_vec128_or(diff, PQCLEAN_MCELIECE460896_SSE_vec128_xor(s0[i][j], s1[i][j]));
+        }
+    }
+
+    return (uint16_t)PQCLEAN_MCELIECE460896_SSE_vec128_testz(diff);
+}
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* input:  sk, secret key */
+/*         c, ciphertext (syndrome) */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
+int PQCLEAN_MCELIECE460896_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
+    int i;
+
+    uint16_t check_synd;
+    uint16_t check_weight;
+
+    vec128 inv[ 64 ][ GFBITS ];
+    vec128 scaled[ 64 ][ GFBITS ];
+    vec128 eval[ 64 ][ GFBITS ];
+
+    vec128 error[ 64 ];
+
+    vec128 s_priv[ 2 ][ GFBITS ];
+    vec128 s_priv_cmp[ 2 ][ GFBITS ];
+
+    vec128 locator[ GFBITS ];
+
+    vec128 recv[ 64 ];
+    vec128 allone;
+
+    vec128 bits_int[25][32];
+
+    // Berlekamp decoder
+
+    preprocess(recv, c);
+
+    PQCLEAN_MCELIECE460896_SSE_load_bits(bits_int, sk + IRR_BYTES);
+    PQCLEAN_MCELIECE460896_SSE_benes(recv, bits_int, 1);
+
+    scaling(scaled, inv, sk, recv);
+    PQCLEAN_MCELIECE460896_SSE_fft_tr(s_priv, scaled);
+    PQCLEAN_MCELIECE460896_SSE_bm(locator, s_priv);
+
+    PQCLEAN_MCELIECE460896_SSE_fft(eval, locator);
+
+    // reencryption and weight check
+
+    allone = PQCLEAN_MCELIECE460896_SSE_vec128_setbits(1);
+
+    for (i = 0; i < 64; i++) {
+        error[i] = PQCLEAN_MCELIECE460896_SSE_vec128_or_reduce(eval[i]);
+        error[i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(error[i], allone);
+    }
+
+    scaling_inv(scaled, inv, error);
+    PQCLEAN_MCELIECE460896_SSE_fft_tr(s_priv_cmp, scaled);
+
+    check_synd = synd_cmp(s_priv, s_priv_cmp);
+
+    //
+
+    PQCLEAN_MCELIECE460896_SSE_benes(error, bits_int, 0);
+
+    postprocess(e, error);
+
+    check_weight = weight_check(e, error);
+
+    return 1 - (check_synd & check_weight);
+}
+
diff --git a/crypto_kem/mceliece460896/sse/decrypt.h b/crypto_kem/mceliece460896/sse/decrypt.h
new file mode 100644
index 00000000..5e7e4566
--- /dev/null
+++ b/crypto_kem/mceliece460896/sse/decrypt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_MCELIECE460896_SSE_DECRYPT_H
+#define PQCLEAN_MCELIECE460896_SSE_DECRYPT_H
+/*
+  This file is for Niederreiter decryption
+*/
+
+int PQCLEAN_MCELIECE460896_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896/sse/encrypt.c b/crypto_kem/mceliece460896/sse/encrypt.c
new file mode 100644
index 00000000..14091872
--- /dev/null
+++ b/crypto_kem/mceliece460896/sse/encrypt.c
@@ -0,0 +1,100 @@
+/*
+  This file is for Niederreiter encryption
+*/
+
+#include "encrypt.h"
+
+#include "params.h"
+#include "randombytes.h"
+#include "util.h"
+
+#include
+#include
+#include
+#include
+
+#include "gf.h"
+
+/* input: public key pk, error vector e */
+/* output: syndrome s */
+extern void PQCLEAN_MCELIECE460896_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e);
+
+/* output: e, an error vector of weight t */
+static void gen_e(unsigned char *e) {
+    size_t i, j;
+    int eq, count;
+
+    uint16_t ind[ SYS_T * 2 ];
+    
uint32_t ind32[ SYS_T * 2 ];
+    uint64_t e_int[ (SYS_N + 63) / 64 ];
+    uint64_t one = 1;
+    uint64_t mask;
+    uint64_t val[ SYS_T ];
+
+    while (1) {
+        randombytes((unsigned char *) ind, sizeof(ind));
+
+        for (i = 0; i < SYS_T * 2; i++) {
+            ind[i] &= GFMASK;
+        }
+
+        // moving and counting indices in the correct range
+
+        count = 0;
+        for (i = 0; i < SYS_T * 2; i++) {
+            if (ind[i] < SYS_N) {
+                ind32[ count++ ] = ind[i];
+            }
+        }
+
+        if (count < SYS_T) {
+            continue;
+        }
+
+        // check for repetition
+
+        eq = 0;
+
+        for (i = 1; i < SYS_T; i++) {
+            for (j = 0; j < i; j++) {
+                if (ind32[i] == ind32[j]) {
+                    eq = 1;
+                }
+            }
+        }
+
+        if (eq == 0) {
+            break;
+        }
+    }
+
+    for (j = 0; j < SYS_T; j++) {
+        val[j] = one << (ind32[j] & 63);
+    }
+
+    for (i = 0; i < (SYS_N + 63) / 64; i++) {
+        e_int[i] = 0;
+
+        for (j = 0; j < SYS_T; j++) {
+            mask = i ^ (ind32[j] >> 6);
+            mask -= 1;
+            mask >>= 63;
+            mask = -mask;
+
+            e_int[i] |= val[j] & mask;
+        }
+    }
+
+    for (i = 0; i < (SYS_N + 63) / 64; i++) {
+        PQCLEAN_MCELIECE460896_SSE_store8(e, e_int[i]);
+        e += 8;
+    }
+}
+
+/* input: public key pk */
+/* output: error vector e, syndrome s */
+void PQCLEAN_MCELIECE460896_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) {
+    gen_e(e);
+    PQCLEAN_MCELIECE460896_SSE_syndrome_asm(s, pk, e);
+}
+
diff --git a/crypto_kem/mceliece460896/sse/encrypt.h b/crypto_kem/mceliece460896/sse/encrypt.h
new file mode 100644
index 00000000..099f6aa2
--- /dev/null
+++ b/crypto_kem/mceliece460896/sse/encrypt.h
@@ -0,0 +1,11 @@
+#ifndef PQCLEAN_MCELIECE460896_SSE_ENCRYPT_H
+#define PQCLEAN_MCELIECE460896_SSE_ENCRYPT_H
+/*
+  This file is for Niederreiter encryption
+*/
+
+
+void PQCLEAN_MCELIECE460896_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896/sse/fft.c b/crypto_kem/mceliece460896/sse/fft.c
new file mode 100644
index 00000000..8b2aff65
--- /dev/null
+++ b/crypto_kem/mceliece460896/sse/fft.c
@@ -0,0 +1,231 @@
+/*
+  This file is for the Gao-Mateer FFT
+  see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
+*/
+
+#include "fft.h"
+
+#include "transpose.h"
+#include "vec128.h"
+
+#include
+
+/* input: in, polynomial in bitsliced form */
+/* output: in, result of applying the radix conversions on in */
+static void radix_conversions(vec128 *in) {
+    int i, j, k;
+    vec128 t;
+    uint64_t v0, v1;
+
+    const vec128 mask[5][2] = {
+        {
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888),
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444)
+        },
+        {
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0),
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030)
+        },
+        {
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000),
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00)
+        },
+        {
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000),
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000)
+        },
+        {
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000),
+            PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
+        }
+    };
+
+    const vec128 s[5][GFBITS] = {
+#include "scalars_2x.inc"
+    };
+
+    //
+
+    for (j = 0; j <= 5; j++) {
+        for (i = 0; i < GFBITS; i++) {
+            v1 = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i], 1);
+            v1 ^= v1 >> 32;
+            v0 = 
PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE460896_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = 
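The vec128_* helpers used throughout this file are declared in vec128.h, which is not part of this hunk; by all appearances they are thin wrappers over 128-bit SSE intrinsics (the implementation requires sse4_1 and popcnt). The sketch below is an editorial illustration of that style with its own names, not the library's actual implementation (compile with -msse4.1):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 */
#include <smmintrin.h>   /* SSE4.1, for _mm_extract_epi64 */

typedef __m128i v128;    /* illustrative type/function names only */

static inline v128 v128_set2x(uint64_t lo, uint64_t hi) {
    return _mm_set_epi64x((long long)hi, (long long)lo);  /* low lane first */
}
static inline v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
static inline v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
static inline uint64_t v128_extract_lo(v128 a) { return (uint64_t)_mm_extract_epi64(a, 0); }
static inline uint64_t v128_extract_hi(v128 a) { return (uint64_t)_mm_extract_epi64(a, 1); }

int main(void) {
    v128 a = v128_set2x(0x0123456789abcdefULL, 0xfedcba9876543210ULL);
    v128 b = v128_set2x(0xffffffff00000000ULL, 0x00000000ffffffffULL);
    v128 c = v128_xor(a, v128_and(a, b));
    printf("%016llx %016llx\n",
           (unsigned long long)v128_extract_lo(c),
           (unsigned long long)v128_extract_hi(c));
    /* expected: 0000000089abcdef fedcba9800000000 */
    return 0;
}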
PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, out[k + s], 
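The reversal[] table used just above to reorder the 64 transform outputs is the bit-reversal permutation on 6-bit indices; it can be regenerated and checked with a few lines:

#include <stdio.h>

/* Reverse the low 6 bits of x (0..63). */
static unsigned bitrev6(unsigned x) {
    unsigned r = 0;
    for (int b = 0; b < 6; b++) {
        r = (r << 1) | ((x >> b) & 1);
    }
    return r;
}

int main(void) {
    /* Prints 0 32 16 48 8 40 24 56 ..., matching the reversal[] table above. */
    for (unsigned j = 0; j < 64; j++) {
        printf("%u%c", bitrev6(j), (j % 8 == 7) ? '\n' : ' ');
    }
    return 0;
}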
(vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896/sse/fft.h b/crypto_kem/mceliece460896/sse/fft.h new file mode 100644 index 00000000..8d5f6004 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_FFT_H +#define PQCLEAN_MCELIECE460896_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE460896_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/fft_tr.c b/crypto_kem/mceliece460896/sse/fft_tr.c new file mode 100644 index 00000000..73bd36b4 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/fft_tr.c @@ -0,0 +1,354 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE460896_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[1][i], 
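How the _tr functions relate to fft.c can be read off a single butterfly pair (a, b) = (out[k], out[k+s]) with constant c, applied lane-wise to bitsliced GF(2^13) elements. The forward code in butterflies() computes a' = a + c*b and then b' = b + a' = a + (1 + c)*b, i.e. it applies

    M  =  | 1    c  |          M^T  =  | 1    1  |
          | 1   1+c |                  | c   1+c |

while butterflies_tr() below computes a'' = a + b and b'' = b + c*(a + b) = c*a + (1 + c)*b, which is exactly M^T. The stages are also traversed in the opposite order (consts_ptr counts down from 64 there, up from 1 in fft.c), as expected when transposing a composition of linear maps.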
mask[k][0]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE460896_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE460896_SSE_vec128_xor(x1, PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE460896_SSE_vec128_xor(x1, PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE460896_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = 
PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = 
PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[15], buf[14]); + 
pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +/* justifying the length of the output */ +static void postprocess(vec128 out[][GFBITS]) { + int i; + uint64_t v[2]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(out[1][i], 0); + v[1] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(out[1][i], 1); + + v[1] = 0; + + out[1][i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896/sse/fft_tr.h b/crypto_kem/mceliece460896/sse/fft_tr.h new file mode 100644 index 00000000..abb77479 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE460896_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE460896_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/gf.c b/crypto_kem/mceliece460896/sse/gf.c new file mode 100644 index 00000000..34f9b754 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t 
t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896_SSE_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(prod[i], (gf) 5881); + } + 
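The reduction steps in gf_mul(), gf_sq2(), gf_sqmul() and gf_sq2mul() above all fold a bit at position p >= 13 onto positions p-9, p-10, p-12 and p-13, i.e. they reduce modulo f(x) = x^13 + x^4 + x^3 + x + 1 (0x201B). A naive bit-serial reference multiplier for cross-checking is sketched below; it branches on its inputs, so it is illustrative only and not a substitute for the constant-time code above.

#include <stdint.h>
#include <stdio.h>

typedef uint16_t gf;

/* Schoolbook multiplication in GF(2^13) with modulus x^13+x^4+x^3+x+1,
 * which is what the shift-by-9/10/12/13 reduction above corresponds to.
 * Not constant-time; for testing only. */
static gf gf_mul_ref(gf a, gf b) {
    uint32_t acc = 0;
    int i;
    for (i = 0; i < 13; i++) {
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }
    for (i = 24; i >= 13; i--) {
        if ((acc >> i) & 1) {
            acc ^= (uint32_t)0x201B << (i - 13);   /* 0x201B = x^13+x^4+x^3+x+1 */
        }
    }
    return (gf)(acc & 0x1FFF);
}

int main(void) {
    /* Spot checks: x * x^12 = x^13 = x^4 + x^3 + x + 1 = 0x001B; 1 is neutral. */
    printf("%04x\n", (unsigned)gf_mul_ref(0x0002, 0x1000));   /* expected 001b */
    printf("%04x\n", (unsigned)gf_mul_ref(0x0001, 0x1234));   /* expected 1234 */
    return 0;
}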
+ for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896/sse/gf.h b/crypto_kem/mceliece460896/sse/gf.h new file mode 100644 index 00000000..bc1c60d3 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_GF_H +#define PQCLEAN_MCELIECE460896_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896_SSE_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896_SSE_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_SSE_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896_SSE_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896_SSE_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896_SSE_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896_SSE_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/operations.c b/crypto_kem/mceliece460896/sse/operations.c new file mode 100644 index 00000000..c7e0f688 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = 
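crypto_kem_dec() above never branches on the decryption outcome: it folds the re-encryption and confirmation checks into a byte mask and selects, byte by byte, between the recovered error vector and the byte string kept at the start of the secret key (implicit rejection). The selection idiom in isolation, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

/* Derive an all-zero / all-one byte mask from a failure flag and pick,
 * byte by byte, between two equal-length buffers without branching. */
static void ct_select(uint8_t *out, const uint8_t *on_ok,
                      const uint8_t *on_fail, size_t len, uint8_t fail) {
    uint16_t m = fail;   /* 0 on success, nonzero (<= 255) on failure */
    m -= 1;              /* 0xFFFF iff fail == 0 */
    m >>= 8;             /* 0xFF on success, 0x00 on failure */
    uint8_t mask = (uint8_t)m;
    for (size_t i = 0; i < len; i++) {
        out[i] = (uint8_t)((mask & on_ok[i]) | (~mask & on_fail[i]));
    }
}

int main(void) {
    uint8_t ok[4] = {1, 2, 3, 4}, bad[4] = {9, 9, 9, 9}, out[4];
    ct_select(out, ok, bad, 4, 0);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4 */
    ct_select(out, ok, bad, 4, 1);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 9 9 9 9 */
    return 0;
}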
PQCLEAN_MCELIECE460896_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/sse/params.h b/crypto_kem/mceliece460896/sse/params.h new file mode 100644 index 00000000..1262257d --- /dev/null +++ b/crypto_kem/mceliece460896/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_PARAMS_H +#define PQCLEAN_MCELIECE460896_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896/sse/pk_gen.c b/crypto_kem/mceliece460896/sse/pk_gen.c new file mode 100644 index 00000000..4c31efb1 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/pk_gen.c @@ -0,0 +1,273 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 127) / 128) +int PQCLEAN_MCELIECE460896_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const uint8_t *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + 
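crypto_kem_keypair() above expands a 32-byte seed with AES-256-CTR and parses the stream with the load2/load4/store2/store8/store_i helpers from util.h, which are not shown in this hunk and are assumed here to be plain little-endian byte (de)serialization. A generic sketch of that style, with its own (hypothetical) names:

#include <stdint.h>
#include <stdio.h>

/* Illustrative little-endian helpers; the library's util.h versions
 * are authoritative. */
static uint16_t le_load2(const unsigned char *src) {
    return (uint16_t)(src[0] | ((uint16_t)src[1] << 8));
}
static uint32_t le_load4(const unsigned char *src) {
    return (uint32_t)src[0] | ((uint32_t)src[1] << 8)
         | ((uint32_t)src[2] << 16) | ((uint32_t)src[3] << 24);
}
static void le_store8(unsigned char *dst, uint64_t v) {
    for (int i = 0; i < 8; i++) {
        dst[i] = (unsigned char)(v >> (8 * i));
    }
}

int main(void) {
    unsigned char buf[8];
    le_store8(buf, 0x0807060504030201ULL);
    printf("%04x %08lx\n", (unsigned)le_load2(buf), (unsigned long)le_load4(buf + 4));
    /* expected: 0201 08070605 */
    return 0;
}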
uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE460896_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE460896_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = 
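The elimination in pk_gen() above works on a bit matrix packed into 64-bit words and uses only mask-and-XOR row operations, so no branch or memory access depends on the matrix contents; if a pivot cannot be established the candidate key is rejected. A toy, self-contained version of the same systematic-form reduction is sketched below (full Gauss-Jordan on an 8x64 matrix; the real code additionally records the row operations in ops[] and replays them on the remaining blocks).

#include <stdint.h>
#include <stdio.h>

#define ROWS 8   /* toy size; pk_gen above works on GFBITS*SYS_T rows */

/* Bring the leftmost ROWSxROWS block of a GF(2) matrix (one uint64_t per
 * row) to the identity using mask-and-XOR row operations only.
 * Returns -1 if the matrix cannot be made systematic. */
static int to_systematic(uint64_t mat[ROWS]) {
    for (int row = 0; row < ROWS; row++) {
        /* make sure the pivot bit is set, borrowing from rows below */
        for (int k = row + 1; k < ROWS; k++) {
            uint64_t mask = (mat[row] >> row) & 1;
            mask -= 1;                       /* all-ones iff pivot still 0 */
            mat[row] ^= mat[k] & mask;
        }
        if (((mat[row] >> row) & 1) == 0) {
            return -1;                       /* not systematic */
        }
        /* clear the pivot column in every other row */
        for (int k = 0; k < ROWS; k++) {
            if (k == row) continue;          /* loop index, not secret data */
            uint64_t mask = (mat[k] >> row) & 1;
            mask = -mask;                    /* all-ones iff bit set */
            mat[k] ^= mat[row] & mask;
        }
    }
    return 0;
}

int main(void) {
    /* identity block plus some columns to the right, then mix two rows */
    uint64_t m[ROWS];
    for (int i = 0; i < ROWS; i++) {
        m[i] = ((uint64_t)1 << i) | (0xA5ULL << 8);
    }
    m[0] ^= m[3];
    m[5] ^= m[1];
    int r = to_systematic(m);
    printf("status %d, row0 %016llx\n", r, (unsigned long long)m[0]);
    /* expected: status 0, row0 000000000000a501 */
    return 0;
}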
PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE460896_SSE_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE460896_SSE_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896/sse/pk_gen.h b/crypto_kem/mceliece460896/sse/pk_gen.h new file mode 100644 index 00000000..80d813a0 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE460896_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE460896_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/scalars_2x.inc b/crypto_kem/mceliece460896/sse/scalars_2x.inc new file mode 100644 index 00000000..ab5fb1ce --- /dev/null +++ b/crypto_kem/mceliece460896/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F00FF00F00F000, 
0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git 
a/crypto_kem/mceliece460896/sse/scalars_4x.inc b/crypto_kem/mceliece460896/sse/scalars_4x.inc new file mode 100644 index 00000000..ba28f053 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 
0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece460896/sse/sk_gen.c b/crypto_kem/mceliece460896/sse/sk_gen.c new file mode 100644 index 00000000..539b849e --- /dev/null +++ b/crypto_kem/mceliece460896/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896_SSE_GF_mul(mat[j], mat[j - 
1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/sse/sk_gen.h b/crypto_kem/mceliece460896/sse/sk_gen.h new file mode 100644 index 00000000..039d1b54 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE460896_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896/sse/syndrome_asm.S b/crypto_kem/mceliece460896/sse/syndrome_asm.S new file mode 100644 index 00000000..eabe2a0a --- /dev/null +++ b/crypto_kem/mceliece460896/sse/syndrome_asm.S @@ -0,0 +1,960 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_SSE_syndrome_asm +.global PQCLEAN_MCELIECE460896_SSE_syndrome_asm +_PQCLEAN_MCELIECE460896_SSE_syndrome_asm: +PQCLEAN_MCELIECE460896_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 156 ] +# asm 1: movdqu 156(ee=reg128#2 +# asm 2: movdqu 156(ee=%xmm1 +movdqu 156(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand 
pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 172 ] +# asm 1: movdqu 172(ee=reg128#3 +# asm 2: movdqu 172(ee=%xmm2 +movdqu 172(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 188 ] +# asm 1: movdqu 188(ee=reg128#3 +# asm 2: movdqu 188(ee=%xmm2 +movdqu 188(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 204 ] +# asm 1: movdqu 204(ee=reg128#3 +# asm 2: movdqu 204(ee=%xmm2 +movdqu 204(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 220 ] +# asm 1: movdqu 220(ee=reg128#3 +# asm 2: movdqu 220(ee=%xmm2 +movdqu 220(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 236 ] +# asm 1: movdqu 236(ee=reg128#3 +# asm 2: movdqu 236(ee=%xmm2 +movdqu 236(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 252 ] +# asm 1: movdqu 252(ee=reg128#3 +# asm 2: movdqu 252(ee=%xmm2 +movdqu 252(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 268 ] +# asm 1: movdqu 268(ee=reg128#3 +# asm 2: movdqu 268(ee=%xmm2 +movdqu 268(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 284 ] +# asm 1: movdqu 284(ee=reg128#3 +# asm 2: movdqu 284(ee=%xmm2 +movdqu 284(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 300 ] +# asm 1: movdqu 300(ee=reg128#3 +# asm 2: movdqu 300(ee=%xmm2 +movdqu 300(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 316 ] +# asm 1: movdqu 316(ee=reg128#3 +# asm 2: movdqu 316(ee=%xmm2 +movdqu 316(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 332 ] +# asm 1: movdqu 332(ee=reg128#3 +# asm 2: movdqu 332(ee=%xmm2 +movdqu 332(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 348 ] +# asm 1: movdqu 348(ee=reg128#3 +# asm 2: movdqu 348(ee=%xmm2 +movdqu 348(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 364 ] +# asm 1: movdqu 364(ee=reg128#3 +# asm 2: movdqu 364(ee=%xmm2 +movdqu 364(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 380 ] +# asm 1: movdqu 380(ee=reg128#3 +# asm 2: movdqu 380(ee=%xmm2 +movdqu 380(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 396 ] +# asm 1: movdqu 396(ee=reg128#3 +# asm 2: movdqu 396(ee=%xmm2 +movdqu 396(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 412 ] +# asm 1: 
movdqu 412(ee=reg128#3 +# asm 2: movdqu 412(ee=%xmm2 +movdqu 412(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 428 ] +# asm 1: movdqu 428(ee=reg128#3 +# asm 2: movdqu 428(ee=%xmm2 +movdqu 428(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 444 ] +# asm 1: movdqu 444(ee=reg128#3 +# asm 2: movdqu 444(ee=%xmm2 +movdqu 444(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 460 ] +# asm 1: movdqu 460(ee=reg128#3 +# asm 2: movdqu 460(ee=%xmm2 +movdqu 460(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 476 ] +# asm 1: movdqu 476(ee=reg128#3 +# asm 2: movdqu 476(ee=%xmm2 +movdqu 476(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 492 ] +# asm 1: movdqu 492(ee=reg128#3 +# asm 2: movdqu 492(ee=%xmm2 +movdqu 492(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 508 ] +# asm 1: movdqu 508(ee=reg128#3 +# asm 2: movdqu 508(ee=%xmm2 +movdqu 508(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 524 ] +# asm 1: movdqu 524(ee=reg128#3 +# asm 2: movdqu 524(ee=%xmm2 +movdqu 524(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 540 ] +# asm 1: movdqu 540(ee=reg128#3 +# asm 2: movdqu 540(ee=%xmm2 +movdqu 540(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 556 ] +# asm 1: movdqu 556(ee=reg128#3 +# asm 2: movdqu 556(ee=%xmm2 +movdqu 556(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: 
movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 96(ss=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 112(ss=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#2 +# asm 2: movdqu 112(ee=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ 
PQCLEAN_MCELIECE460896_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw 
$8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 
1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 
unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: 
vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq 
$32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 
2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 
432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x 
v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & 
mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: 
vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# 
qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# 
qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq 
$32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# 
asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + 
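+# Explanatory note (added comment, not part of the qhasm-generated
+# output; the names lo, hi, mask_lo, mask_hi and w are illustrative
+# only): the vpand/vpsll*/vpsrl*/vpor sequences in these generated
+# transpose routines implement the usual masked shift-and-swap
+# bit-matrix transpose.  For one pair of rows and block width w, a
+# round is, in pseudo-C:
+#
+#     v00 = lo & mask_lo;              /* blocks that stay in lo      */
+#     v10 = (hi & mask_lo) << w;       /* blocks moving from hi to lo */
+#     v01 = (lo & mask_hi) >> w;       /* blocks moving from lo to hi */
+#     v11 = hi & mask_hi;              /* blocks that stay in hi      */
+#     lo  = v00 | v10;
+#     hi  = v01 | v11;
+#
+# For w = 32, 16, 8 the pre-shift vpand is dropped, because the packed
+# shifts (vpsllq/vpslld/vpsllw and their right-shift counterparts)
+# already discard the bits that would leave their lane; the rounds that
+# shift by 4, 2 and 1 bits (see below) keep both vpand steps.  Running
+# all six widths over the row pairs bit-transposes each 64x64 block.
+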
+# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK1_0 ] +# asm 1: movdqa 
PQCLEAN_MCELIECE460896_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 
+vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# 
qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq 
$2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand 
v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: 
x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# 
qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 
416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand 
v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 
unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 
512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# 
asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: 
vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# 
qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: 
psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# 
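The `s0 = (s1 s0) >> 1` steps that follow are double-precision shifts: `shrd $1` moves the destination quadword right by one bit and feeds bit 0 of the source quadword into bit 63, so a long bit string stored as consecutive 64-bit words is shifted right by one as a whole. A one-line C model of that instruction (helper name illustrative):

#include <stdint.h>

/* C model of `shrd $1, hi, lo`: shift lo right by one and pull in
   bit 0 of hi as the new bit 63. */
static inline uint64_t shrd1(uint64_t lo, uint64_t hi) {
    return (lo >> 1) | (hi << 63);
}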
asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + 
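    /* Bytes go out least-significant first (little-endian); load8 below reads
       them back in the inverse order, so it recovers `in` exactly. */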
out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE460896_SSE_vec128_set2x( PQCLEAN_MCELIECE460896_SSE_load8(in), PQCLEAN_MCELIECE460896_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896_SSE_store8(out + 0, PQCLEAN_MCELIECE460896_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896_SSE_store8(out + 8, PQCLEAN_MCELIECE460896_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece460896/sse/util.h b/crypto_kem/mceliece460896/sse/util.h new file mode 100644 index 00000000..312e1ed1 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_UTIL_H +#define PQCLEAN_MCELIECE460896_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE460896_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece460896/sse/vec128.c b/crypto_kem/mceliece460896/sse/vec128.c new file mode 100644 index 00000000..ff68a501 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE460896_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_setbits(uint64_t a) { + return 
_mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE460896_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE460896_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE460896_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE460896_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE460896_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git 
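vec128_inv above is Fermat inversion in GF(2^13): a nonzero element's inverse is its (2^13 - 2)-th power, and the fixed squaring/multiplication schedule accumulates exactly that exponent (binary 1111111111110, matching the trailing comments). A small integer-only sanity check of the exponent schedule; it is illustrative and not part of the library:

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* Squaring doubles the exponent of the running value; multiplying by a
       saved power adds that power's exponent. Follow vec128_inv's schedule: */
    uint32_t e = 1;              /* out = in                 -> in^1    */
    e *= 2;                      /* sq                       -> in^2    */
    uint32_t e11 = e + 1;        /* tmp_11 = out * in        -> in^3    */
    e = e11 * 4;                 /* sq, sq                   -> in^12   */
    uint32_t e1111 = e + e11;    /* tmp_1111 = out * tmp_11  -> in^15   */
    e = e1111 * 16;              /* sq x4                    -> in^240  */
    e += e1111;                  /* mul by tmp_1111          -> in^255  */
    e *= 16;                     /* sq x4                    -> in^4080 */
    e += e1111;                  /* mul by tmp_1111          -> in^4095 */
    e *= 2;                      /* final sq                 -> in^8190 */
    assert(e == (1u << 13) - 2); /* 8190 = 2^13 - 2, the inverse exponent */
    return 0;
}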
a/crypto_kem/mceliece460896/sse/vec128.h b/crypto_kem/mceliece460896/sse/vec128.h new file mode 100644 index 00000000..7bcb1977 --- /dev/null +++ b/crypto_kem/mceliece460896/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE460896_SSE_VEC128_H +#define PQCLEAN_MCELIECE460896_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE460896_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE460896_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE460896_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece460896/sse/vec128_mul_asm.S b/crypto_kem/mceliece460896/sse/vec128_mul_asm.S new file mode 100644 index 00000000..0d48cbaf --- /dev/null +++ b/crypto_kem/mceliece460896/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# 
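As a usage note for the vec128 wrappers declared in vec128.h above: set2x places its first argument in the low 64-bit lane, extract reads a lane back (its index must be a literal, as the macro comment says), and setbits(1) produces an all-ones vector. A minimal stand-alone check, assuming it is compiled and linked against vec128.c under the same SSE4.1/POPCNT flags as the rest of this implementation:

#include <assert.h>
#include <stdint.h>
#include "vec128.h"

int main(void) {
    /* Lane ordering: argument 0 of set2x is lane 0 of the vector. */
    vec128 v = PQCLEAN_MCELIECE460896_SSE_vec128_set2x(0x0123456789abcdefULL, 0x13ULL);
    assert(PQCLEAN_MCELIECE460896_SSE_vec128_extract(v, 0) == 0x0123456789abcdefULL);
    assert(PQCLEAN_MCELIECE460896_SSE_vec128_extract(v, 1) == 0x13ULL);

    /* setbits(1) is all ones; xor with itself gives the zero vector. */
    vec128 ones = PQCLEAN_MCELIECE460896_SSE_vec128_setbits(1);
    vec128 zero = PQCLEAN_MCELIECE460896_SSE_vec128_xor(ones, ones);
    assert(PQCLEAN_MCELIECE460896_SSE_vec128_testz(zero));
    return 0;
}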
qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE460896_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor 
r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 
1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# 
asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 
80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor 
r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt 
%rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896/vec/api.h b/crypto_kem/mceliece460896/vec/api.h new file mode 100644 index 00000000..84529cd9 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_API_H +#define PQCLEAN_MCELIECE460896_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_ALGNAME "Classic McEliece 460896" +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/benes.c b/crypto_kem/mceliece460896/vec/benes.c new file mode 100644 index 00000000..f240d97a --- /dev/null +++ b/crypto_kem/mceliece460896/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896_VEC_benes(vec *r, const unsigned char *bits, int rev) 
{ + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece460896/vec/benes.h b/crypto_kem/mceliece460896/vec/benes.h new file mode 100644 index 00000000..9d4727b7 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_BENES_H +#define PQCLEAN_MCELIECE460896_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE460896_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/bm.c b/crypto_kem/mceliece460896/vec/bm.c new file mode 100644 index 00000000..48396da6 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/bm.c @@ -0,0 +1,238 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int 
idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896_VEC_bm(vec out[][GFBITS], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] 
= one << 63; + B[0][0] = 0; + B[1][0] = one << 62; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + update(interval, coefs[N]); + + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[1], C[1], interval[1]); + + d = vec_reduce(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE460896_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE460896_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + out[0][i] = (C[0][i] >> 31) | (C[1][i] << 33); + out[1][i] = C[1][i] >> 31; + } +} + diff --git a/crypto_kem/mceliece460896/vec/bm.h b/crypto_kem/mceliece460896/vec/bm.h new file mode 100644 index 00000000..1c432044 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_BM_H +#define PQCLEAN_MCELIECE460896_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/consts.inc b/crypto_kem/mceliece460896/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece460896/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 
0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 
0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 
0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 
0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 
0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 
0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 
0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 
0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 
0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece460896/vec/controlbits.c b/crypto_kem/mceliece460896/vec/controlbits.c new file mode 100644 index 00000000..56ab9db7 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int 
n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896/vec/controlbits.h b/crypto_kem/mceliece460896/vec/controlbits.h new file mode 100644 index 00000000..52ad0be7 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
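The control-bit generation above is a fixed sorting network whose only data-dependent operations are the branch-free is_smaller()/cswap() primitives, so neither the control flow nor the memory-access pattern depends on the secret permutation. A minimal standalone sketch of the same compare-and-swap idiom (ct_minmax is a hypothetical name, not part of the reference code; valid for inputs below 2^31, since the borrow of the subtraction then lands in bit 31):

#include <assert.h>
#include <stdint.h>

/* branch-free compare-and-swap, mirroring minmax()/cswap() above:
   after the call, *x <= *y */
static void ct_minmax(uint32_t *x, uint32_t *y) {
    uint32_t swap = (*y - *x) >> 31;       /* 1 iff *y < *x (inputs must be < 2^31) */
    uint32_t mask = (uint32_t)(0 - swap);  /* all-ones iff a swap is needed */
    uint32_t d = (*x ^ *y) & mask;
    *x ^= d;
    *y ^= d;
}

int main(void) {
    uint32_t a = 7, b = 3;
    ct_minmax(&a, &b);
    assert(a == 3 && b == 7);   /* swapped */
    a = 2;
    b = 9;
    ct_minmax(&a, &b);
    assert(a == 2 && b == 9);   /* left alone */
    return 0;
}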
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/crypto_hash.h b/crypto_kem/mceliece460896/vec/crypto_hash.h new file mode 100644 index 00000000..c2da7b50 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896/vec/decrypt.c b/crypto_kem/mceliece460896/vec/decrypt.c new file mode 100644 index 00000000..c78c891e --- /dev/null +++ b/crypto_kem/mceliece460896/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE460896_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE460896_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE460896_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE460896_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static int weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return 
(uint16_t)PQCLEAN_MCELIECE460896_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE460896_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE460896_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE460896_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE460896_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE460896_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE460896_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = (uint16_t)weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece460896/vec/decrypt.h b/crypto_kem/mceliece460896/vec/decrypt.h new file mode 100644 index 00000000..0b3cdda3 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE460896_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE460896_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/encrypt.c b/crypto_kem/mceliece460896/vec/encrypt.c new file mode 100644 index 00000000..ad71e496 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ 
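The decryption routine above avoids branching on secret data: weight_check() and synd_cmp() each collapse their comparison into a 0/1 flag using mask arithmetic, and the final result is 1 - (check_synd & check_weight), i.e. 0 only when both the error weight and the recomputed syndrome match. A self-contained sketch of the 16-bit equality flag used in weight_check() (eq15 is a hypothetical name; correct for operands below 2^15, which covers SYS_T and the computed weights):

#include <assert.h>
#include <stdint.h>

/* returns 1 iff a == b, without branching; same subtract-and-shift idiom
   as the `check` computation in weight_check() above */
static uint16_t eq15(uint16_t a, uint16_t b) {
    uint16_t check = a ^ b;  /* zero iff equal */
    check -= 1;              /* wraps to 0xFFFF exactly when it was zero */
    check >>= 15;
    return check;
}

int main(void) {
    assert(eq15(96, 96) == 1);   /* e.g. a weight equal to SYS_T */
    assert(eq15(95, 96) == 0);
    assert(eq15(0, 4608) == 0);
    return 0;
}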
(ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896_VEC_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE460896_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE460896_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE460896_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE460896_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896/vec/encrypt.h b/crypto_kem/mceliece460896/vec/encrypt.h new file mode 100644 index 00000000..a864abf6 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE460896_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/fft.c b/crypto_kem/mceliece460896/vec/fft.c new file mode 100644 index 00000000..d0a9b954 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/fft.c @@ -0,0 +1,269 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include 
"consts.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE460896_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] 
= buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + +// for (i = 0; i < 128; i++) +// for (b = 0; b < GFBITS; b++) +// out[i][b] ^= powers[i][b]; +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896/vec/fft.h b/crypto_kem/mceliece460896/vec/fft.h new file mode 100644 index 00000000..d58828f5 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_FFT_H +#define PQCLEAN_MCELIECE460896_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/fft_tr.c b/crypto_kem/mceliece460896/vec/fft_tr.c new file mode 100644 index 00000000..35b42a7a --- /dev/null +++ b/crypto_kem/mceliece460896/vec/fft_tr.c @@ -0,0 +1,299 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + 
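The butterflies above write their 128 evaluation points through the reversal[] table, and the transposed FFT below reads through the same table; the listed entries match the 7-bit bit-reversal permutation of the indices 0..127. A minimal sketch that reproduces the table (bitrev7 is a hypothetical helper, shown only to make the indexing explicit):

#include <assert.h>
#include <stdint.h>

/* reverse the low 7 bits of x (0 <= x < 128) */
static uint8_t bitrev7(uint8_t x) {
    uint8_t r = 0;
    int b;
    for (b = 0; b < 7; b++) {
        r = (uint8_t)(r | (((x >> b) & 1u) << (6 - b)));
    }
    return r;
}

int main(void) {
    /* spot-check against the first row of reversal[]: 0, 64, 32, 96, 16, 80, 48, 112 */
    assert(bitrev7(0) == 0);
    assert(bitrev7(1) == 64);
    assert(bitrev7(2) == 32);
    assert(bitrev7(3) == 96);
    assert(bitrev7(7) == 112);
    assert(bitrev7(127) == 127);
    return 0;
}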
+#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE460896_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE460896_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= 
buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE460896_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + 
tmp[j] = PQCLEAN_MCELIECE460896_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE460896_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +/* justifying the length of the output */ +static void postprocess(vec out[4][GFBITS]) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[3][i] = 0; + } +} + +void PQCLEAN_MCELIECE460896_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896/vec/fft_tr.h b/crypto_kem/mceliece460896/vec/fft_tr.h new file mode 100644 index 00000000..83b0da30 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE460896_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/gf.c b/crypto_kem/mceliece460896/vec/gf.c new file mode 100644 index 00000000..55364ac6 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, 
+ 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896_VEC_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896/vec/gf.h b/crypto_kem/mceliece460896/vec/gf.h new file mode 100644 index 00000000..dce2307b --- /dev/null +++ b/crypto_kem/mceliece460896/vec/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_GF_H +#define PQCLEAN_MCELIECE460896_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896_VEC_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896_VEC_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_VEC_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896_VEC_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896_VEC_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896_VEC_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896_VEC_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/operations.c b/crypto_kem/mceliece460896/vec/operations.c new file mode 100644 index 00000000..3ecd54ce --- /dev/null +++ b/crypto_kem/mceliece460896/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + 
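In gf_mul() above, the two reduction steps fold bits 13..24 back down by 9, 10, 12 and 13 positions, which corresponds to reducing modulo the field polynomial x^13 + x^4 + x^3 + x + 1. A slow reference multiplication under that reading (gf13_mul_ref is a hypothetical helper, useful only for cross-checking gf_mul on a handful of inputs; it is not constant time and is not part of the implementation):

#include <assert.h>
#include <stdint.h>

#define GF13_POLY 0x201Bu   /* x^13 + x^4 + x^3 + x + 1 */
#define GF13_MASK 0x1FFFu

/* schoolbook carry-less multiplication followed by bit-at-a-time reduction */
static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;
    int i;

    for (i = 0; i < 13; i++) {
        if ((b >> i) & 1u) {
            acc ^= (uint32_t)a << i;
        }
    }
    for (i = 24; i >= 13; i--) {
        if ((acc >> i) & 1u) {
            acc ^= GF13_POLY << (i - 13);
        }
    }
    return (uint16_t)(acc & GF13_MASK);
}

int main(void) {
    assert(gf13_mul_ref(1, 0x1234) == 0x1234);   /* 1 is the multiplicative identity */
    assert(gf13_mul_ref(2, 0x1000) == 0x001B);   /* x * x^12 = x^13 = x^4 + x^3 + x + 1 */
    assert(gf13_mul_ref(3, 3) == 5);             /* (x + 1)^2 = x^2 + 1 */
    return 0;
}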
crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/vec/params.h b/crypto_kem/mceliece460896/vec/params.h new file mode 100644 index 00000000..a8a9184d --- /dev/null +++ b/crypto_kem/mceliece460896/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_PARAMS_H +#define PQCLEAN_MCELIECE460896_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896/vec/pk_gen.c b/crypto_kem/mceliece460896/vec/pk_gen.c new file mode 100644 index 00000000..b95406d7 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/pk_gen.c @@ -0,0 +1,250 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + 
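In crypto_kem_dec() above, the re-encryption flag and the confirmation-tag flag are folded into a single byte mask m (0xFF when decapsulation succeeded, 0x00 otherwise), which then selects byte by byte between the recovered error vector e and the random string stored at the start of the secret key; a shared key is always derived, so failures are rejected implicitly rather than signalled. A small sketch of that select idiom (ok_mask/select_byte are hypothetical names; the real code inlines the same arithmetic):

#include <assert.h>
#include <stdint.h>

/* fold a "nonzero means failure" flag into a 0x00/0xFF mask,
   like m = ret_decrypt | ret_confirm above */
static uint8_t ok_mask(uint8_t failed) {
    uint16_t m = failed;
    m -= 1;            /* wraps to 0xFFFF exactly when failed == 0 */
    m >>= 8;
    return (uint8_t)m; /* 0xFF on success, 0x00 on failure */
}

/* branch-free per-byte select, equivalent to (~m & sk[i]) | (m & e[i]) above:
   returns b on success, a on failure */
static uint8_t select_byte(uint8_t mask, uint8_t a, uint8_t b) {
    return (uint8_t)(((mask ^ 0xFFu) & a) | (mask & b));
}

int main(void) {
    assert(ok_mask(0) == 0xFF);
    assert(ok_mask(7) == 0x00);
    assert(select_byte(ok_mask(0), 0x11, 0x22) == 0x22);  /* success: take e[i]  */
    assert(select_byte(ok_mask(9), 0x11, 0x22) == 0x11);  /* failure: take sk[i] */
    return 0;
}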
for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +int PQCLEAN_MCELIECE460896_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + const int block_idx = NBLOCKS_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE460896_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE460896_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map 
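The Gaussian elimination in PQCLEAN_MCELIECE460896_VEC_pk_gen() above never branches on matrix bits: each candidate pivot bit is expanded to an all-zero/all-one 64-bit mask, and the row addition is applied unconditionally through that mask (the same pattern builds ops[], the record of row operations later replayed on the non-systematic columns). A small sketch of the masked conditional row XOR (cond_row_xor is a hypothetical name, operating on a single 64-bit block per row):

#include <assert.h>
#include <stdint.h>

/* XOR src into *dst only if bit `bit` of *dst is set, without branching */
static void cond_row_xor(uint64_t *dst, uint64_t src, int bit) {
    uint64_t mask = (*dst >> bit) & 1u;  /* 0 or 1 */
    mask = -mask;                        /* all-zero or all-one mask */
    *dst ^= src & mask;
}

int main(void) {
    uint64_t row = 0x9;          /* bits 3 and 0 set */
    cond_row_xor(&row, 0xF, 3);  /* bit 3 set -> row ^= 0xF */
    assert(row == 0x6);
    cond_row_xor(&row, 0xF0, 3); /* bit 3 now clear -> row unchanged */
    assert(row == 0x6);
    return 0;
}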
required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE460896_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE460896_VEC_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE460896_VEC_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896/vec/pk_gen.h b/crypto_kem/mceliece460896/vec/pk_gen.h new file mode 100644 index 00000000..03f373e9 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE460896_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE460896_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/scalars_2x.inc b/crypto_kem/mceliece460896/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 
0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece460896/vec/scalars_4x.inc b/crypto_kem/mceliece460896/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 
0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece460896/vec/sk_gen.c b/crypto_kem/mceliece460896/vec/sk_gen.c new file mode 100644 index 00000000..176e097e --- /dev/null +++ b/crypto_kem/mceliece460896/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896_VEC_perm_check(const uint32_t *p) { + int i; + 
uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896/vec/sk_gen.h b/crypto_kem/mceliece460896/vec/sk_gen.h new file mode 100644 index 00000000..9ee8b1b9 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE460896_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/transpose.c b/crypto_kem/mceliece460896/vec/transpose.c new file mode 100644 index 00000000..0a361e10 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE460896_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece460896/vec/transpose.h b/crypto_kem/mceliece460896/vec/transpose.h new file mode 100644 index 00000000..9413e918 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE460896_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE460896_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/util.c b/crypto_kem/mceliece460896/vec/util.c new file mode 100644 index 00000000..23620c1b --- /dev/null +++ b/crypto_kem/mceliece460896/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE460896_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + 
irr[i] = PQCLEAN_MCELIECE460896_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE460896_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece460896/vec/util.h b/crypto_kem/mceliece460896/vec/util.h new file mode 100644 index 00000000..e2e88a62 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/util.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_UTIL_H +#define PQCLEAN_MCELIECE460896_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE460896_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE460896_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE460896_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE460896_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE460896_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE460896_VEC_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece460896/vec/vec.c b/crypto_kem/mceliece460896/vec/vec.c new file mode 100644 index 00000000..e114c475 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/vec.c @@ -0,0 +1,139 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE460896_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE460896_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE460896_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE460896_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + + +void PQCLEAN_MCELIECE460896_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE460896_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = 
in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE460896_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE460896_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE460896_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece460896/vec/vec.h b/crypto_kem/mceliece460896/vec/vec.h new file mode 100644 index 00000000..86b8e356 --- /dev/null +++ b/crypto_kem/mceliece460896/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE460896_VEC_VEC_H +#define PQCLEAN_MCELIECE460896_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE460896_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE460896_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE460896_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE460896_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE460896_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE460896_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE460896_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE460896_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/META.yml b/crypto_kem/mceliece460896f/META.yml new file mode 100644 index 00000000..b89ed494 --- /dev/null +++ b/crypto_kem/mceliece460896f/META.yml @@ -0,0 +1,50 @@ +name: Classic McEliece 460896f +type: kem +claimed-nist-level: 3 +claimed-security: IND-CCA2 +length-public-key: 524160 +length-secret-key: 13568 +length-ciphertext: 188 +length-shared-secret: 32 +nistkat-sha256: b0822a5d00d7fad26380044c77b33370a5fb38e7851263229f590cac323a46a7 +principal-submitters: + - Daniel J. 
Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - bmi1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi1 + - popcnt diff --git a/crypto_kem/mceliece460896f/avx/LICENSE b/crypto_kem/mceliece460896f/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece460896f/avx/Makefile b/crypto_kem/mceliece460896f/avx/Makefile new file mode 100644 index 00000000..101034b4 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece460896f_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece460896f/avx/aes256ctr.c b/crypto_kem/mceliece460896f/avx/aes256ctr.c new file mode 100644 index 00000000..344783b9 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE460896F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + 
aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece460896f/avx/aes256ctr.h b/crypto_kem/mceliece460896f/avx/aes256ctr.h new file mode 100644 index 00000000..9d778c90 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE460896F_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896f/avx/api.h b/crypto_kem/mceliece460896f/avx/api.h new file mode 100644 index 00000000..674fa277 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_API_H +#define PQCLEAN_MCELIECE460896F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_ALGNAME "Classic McEliece 460896f" +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/benes.c b/crypto_kem/mceliece460896f/avx/benes.c new file mode 100644 index 00000000..defa2902 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = 
PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = 
PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(PQCLEAN_MCELIECE460896F_AVX_load8(ptr), PQCLEAN_MCELIECE460896F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(PQCLEAN_MCELIECE460896F_AVX_load8(ptr), PQCLEAN_MCELIECE460896F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control 
bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece460896f/avx/benes.h b/crypto_kem/mceliece460896f/avx/benes.h new file mode 100644 index 00000000..5032513a --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_BENES_H +#define PQCLEAN_MCELIECE460896F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE460896F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/bm.c b/crypto_kem/mceliece460896f/avx/bm.c new file mode 100644 index 00000000..e71ee94b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/bm.c @@ -0,0 +1,210 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE460896F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = 
PQCLEAN_MCELIECE460896F_AVX_vec256_or(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896F_AVX_vec256_or(PQCLEAN_MCELIECE460896F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + 
BC[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0, one << 62); + BC[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896F_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + + d = PQCLEAN_MCELIECE460896F_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE460896F_AVX_update_asm(BC, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(BC[i][1], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(BC[i][1], 1); + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/crypto_kem/mceliece460896f/avx/bm.h b/crypto_kem/mceliece460896f/avx/bm.h new file mode 100644 index 00000000..b4cf2396 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_BM_H +#define PQCLEAN_MCELIECE460896F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/consts.S b/crypto_kem/mceliece460896f/avx/consts.S new file mode 100644 index 00000000..be963193 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE460896F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE460896F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE460896F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE460896F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece460896f/avx/consts.inc b/crypto_kem/mceliece460896f/avx/consts.inc new file mode 100644 index 00000000..9d1846bb --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 
0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 
0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 
0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece460896f/avx/controlbits.c b/crypto_kem/mceliece460896f/avx/controlbits.c new file mode 100644 index 00000000..160ff39b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort 
x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE460896F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/avx/controlbits.h b/crypto_kem/mceliece460896f/avx/controlbits.h new file mode 100644 index 00000000..2e0620f0 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/crypto_hash.h b/crypto_kem/mceliece460896f/avx/crypto_hash.h new file mode 100644 index 00000000..476f5111 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896f/avx/decrypt.c b/crypto_kem/mceliece460896f/avx/decrypt.c new file mode 100644 index 00000000..a39557a2 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE460896F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(err[i], 0); + v[1] = 
PQCLEAN_MCELIECE460896F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec256_or(diff, PQCLEAN_MCELIECE460896F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE460896F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE460896F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE460896F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE460896F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = 
PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE460896F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE460896F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece460896f/avx/decrypt.h b/crypto_kem/mceliece460896f/avx/decrypt.h new file mode 100644 index 00000000..c4d05ca5 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE460896F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE460896F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/encrypt.c b/crypto_kem/mceliece460896f/avx/encrypt.c new file mode 100644 index 00000000..162b5c66 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE460896F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE460896F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896F_AVX_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE460896F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896f/avx/encrypt.h b/crypto_kem/mceliece460896f/avx/encrypt.h new file mode 100644 index 00000000..17a83e3a --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE460896F_AVX_ENCRYPT_H +/* + This file 
is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/fft.c b/crypto_kem/mceliece460896f/avx/fft.c new file mode 100644 index 00000000..e2cf502e --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 
8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 
0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896f/avx/fft.h b/crypto_kem/mceliece460896f/avx/fft.h new file mode 100644 index 00000000..cc1e3abe --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_FFT_H +#define PQCLEAN_MCELIECE460896F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE460896F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/fft_tr.c b/crypto_kem/mceliece460896f/avx/fft_tr.c new file mode 100644 index 00000000..dc0533fd --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/fft_tr.c @@ -0,0 +1,398 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(t, 1 << 
k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = 
PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 3); + + v[3] = 0; + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE460896F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896f/avx/fft_tr.h b/crypto_kem/mceliece460896f/avx/fft_tr.h new file mode 100644 index 00000000..b43cd448 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE460896F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/gf.c b/crypto_kem/mceliece460896f/avx/gf.c new file mode 100644 index 00000000..790d218c --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t 
>> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896F_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/avx/gf.h b/crypto_kem/mceliece460896f/avx/gf.h new file mode 100644 index 00000000..67987f8b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_GF_H +#define PQCLEAN_MCELIECE460896F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896F_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896F_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896F_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896F_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896F_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896F_AVX_GF_mul(gf *out, const gf 
*in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/int32_sort.c b/crypto_kem/mceliece460896f/avx/int32_sort.c new file mode 100644 index 00000000..fbae1dd7 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * 
q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = 
_mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * 
q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = 
_mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = 
_mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * 
q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void 
PQCLEAN_MCELIECE460896F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE460896F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 
0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = 
_mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece460896f/avx/int32_sort.h b/crypto_kem/mceliece460896f/avx/int32_sort.h new file mode 100644 index 00000000..ec7a3c0b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE460896F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE460896F_AVX_int32_sort(int32_t *x, size_t n); + 
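+/*
+ * Illustrative usage (hypothetical example values): the routine sorts n
+ * 32-bit signed integers in place, in ascending order, using an AVX2
+ * sorting network intended to take time independent of the array contents.
+ *
+ *     int32_t a[5] = {3, -1, 7, 0, 2};
+ *     PQCLEAN_MCELIECE460896F_AVX_int32_sort(a, 5);   // a == {-1, 0, 2, 3, 7}
+ */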
+#endif diff --git a/crypto_kem/mceliece460896f/avx/operations.c b/crypto_kem/mceliece460896f/avx/operations.c new file mode 100644 index 00000000..703e5206 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/avx/params.h b/crypto_kem/mceliece460896f/avx/params.h new file mode 100644 index 00000000..d5923192 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE460896F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES 
((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/pk_gen.c b/crypto_kem/mceliece460896f/avx/pk_gen.c new file mode 100644 index 00000000..dcb0007b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/pk_gen.c @@ -0,0 +1,358 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], 
uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = mat[ row + i ][ block_idx ]; + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = mat[ i + j ][ block_idx ]; + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx ] = buf[j]; + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE460896F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE460896F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j 
][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + PQCLEAN_MCELIECE460896F_AVX_store_i(pk, mat[i][ NBLOCKS1_I - 1 ] >> tail, (64 - tail) / 8); + pk += (64 - tail) / 8; + + for (j = NBLOCKS1_I; j < NBLOCKS1_H; j++) { + PQCLEAN_MCELIECE460896F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/avx/pk_gen.h b/crypto_kem/mceliece460896f/avx/pk_gen.h new file mode 100644 index 00000000..718b45ec --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE460896F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE460896F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/scalars_2x.inc b/crypto_kem/mceliece460896f/avx/scalars_2x.inc new file mode 100644 index 00000000..1783f9db --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + 
PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece460896f/avx/scalars_4x.inc b/crypto_kem/mceliece460896f/avx/scalars_4x.inc new file mode 100644 index 00000000..2614718b --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF000FFF000000, 
0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece460896f/avx/sk_gen.c b/crypto_kem/mceliece460896f/avx/sk_gen.c new file mode 100644 index 00000000..ce68f366 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/avx/sk_gen.h b/crypto_kem/mceliece460896f/avx/sk_gen.h new file mode 100644 index 00000000..c80be906 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE460896F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/syndrome_asm.S b/crypto_kem/mceliece460896f/avx/syndrome_asm.S new file mode 100644 index 00000000..a65147dd --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/syndrome_asm.S @@ -0,0 +1,650 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# 
qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE460896F_AVX_syndrome_asm +_PQCLEAN_MCELIECE460896F_AVX_syndrome_asm: +PQCLEAN_MCELIECE460896F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 156 ] +# asm 1: vmovupd 156(ee=reg256#2 +# asm 2: vmovupd 156(ee=%ymm1 +vmovupd 156(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 188 ] +# asm 1: vmovupd 188(ee=reg256#3 +# asm 2: vmovupd 188(ee=%ymm2 +vmovupd 188(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 220 ] +# asm 1: vmovupd 220(ee=reg256#3 +# asm 2: vmovupd 220(ee=%ymm2 +vmovupd 220(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 252 ] +# asm 1: vmovupd 252(ee=reg256#3 +# asm 2: vmovupd 252(ee=%ymm2 +vmovupd 252(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 284 ] +# asm 1: vmovupd 284(ee=reg256#3 +# asm 2: vmovupd 284(ee=%ymm2 +vmovupd 284(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 316 ] +# asm 1: vmovupd 316(ee=reg256#3 +# asm 2: vmovupd 316(ee=%ymm2 +vmovupd 316(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 348 ] +# asm 1: vmovupd 348(ee=reg256#3 +# asm 2: vmovupd 348(ee=%ymm2 +vmovupd 348(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 380 ] +# asm 1: vmovupd 380(ee=reg256#3 +# asm 2: vmovupd 380(ee=%ymm2 +vmovupd 380(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 412 ] +# asm 1: vmovupd 412(ee=reg256#3 +# asm 2: vmovupd 412(ee=%ymm2 +vmovupd 412(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 444 ] +# asm 1: vmovupd 444(ee=reg256#3 +# asm 2: vmovupd 444(ee=%ymm2 +vmovupd 444(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 476 ] +# asm 1: vmovupd 476(ee=reg256#3 +# asm 2: vmovupd 476(ee=%ymm2 +vmovupd 476(%rdx),%ymm2 + +# qhasm: pp 
&= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 508 ] +# asm 1: vmovupd 508(ee=reg256#3 +# asm 2: vmovupd 508(ee=%ymm2 +vmovupd 508(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 540 ] +# asm 1: vmovupd 540(ee=reg256#3 +# asm 2: vmovupd 540(ee=%ymm2 +vmovupd 540(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# 
asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 
+vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: 
vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 
x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 
+# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: 
vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq 
$32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# 
asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand 
%xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = 
x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: 
v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ 
input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# 
asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 
+vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x 
v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld 
$16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor 
x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 
+# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld 
$16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor 
x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# 
asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 
+# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# 
qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# 
qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 
352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 
2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# 
asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 
1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 
+# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# 
asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand 
%xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 
+vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: 
vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = 
mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 
v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# 
qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 
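# ---------------------------------------------------------------------------
# Editorial note (not part of the qhasm-generated output): the repeating
# vpand / psllq / psrlq / vpor groups in this routine are the layers of a
# standard bit-matrix transpose butterfly. Each layer pairs two rows, keeps
# the "low" bit groups of one and the "high" bit groups of the other with a
# pair of complementary masks, shifts the crossing groups by the layer width
# (here 4, 2 and 1 bits), and recombines with OR. A scalar sketch of one
# layer, assuming 64-bit rows; the helper name delta_swap and its signature
# are illustrative only, not an API from this repository (needs stdint.h):
#
#   static inline void delta_swap(uint64_t *a, uint64_t *b, unsigned s,
#                                 uint64_t mask_lo, uint64_t mask_hi) {
#       /* bits of a selected by mask_lo stay; mask_lo bits of b move up by s */
#       uint64_t lo = (*a & mask_lo) | ((*b & mask_lo) << s);
#       /* mask_hi bits of a move down by s; mask_hi bits of b stay */
#       uint64_t hi = ((*a & mask_hi) >> s) | (*b & mask_hi);
#       *a = lo;
#       *b = hi;
#   }
# ---------------------------------------------------------------------------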
+vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = 
x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 
+# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand 
v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 
+# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand 
v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq 
$32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# 
asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 
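# ---------------------------------------------------------------------------
# Editorial note (not part of the qhasm-generated output): this 256-bit
# section is the same transpose butterfly sketched above, run on ymm
# registers. The layer widths are 32, 16 and 8 bits (vpsllq $32 /
# vpslld $16 / vpsllw $8), with the mask pairs loaded earlier in the routine
# from the PQCLEAN_MCELIECE460896F_AVX_MASK5/MASK4/MASK3 constants. For the
# crossing terms the code lets the in-lane shift itself discard the unwanted
# half (e.g. "4x v10 = x4 << 32"), so only the non-crossing terms need an
# explicit vpand; the recombination with vpor is unchanged.
# ---------------------------------------------------------------------------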
+vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq 
$32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 
2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# 
asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld 
$16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw 
$8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand 
v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 
2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 
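#
# The vpsllw $8 / vpsrlw $8 pair around this point, combined with the
# mask4/mask5 vpand's and the two vpor's, swaps the high byte of every
# 16-bit word of x6 with the low byte of the corresponding word of x7;
# the same byte swap was just applied to the pairs (x0,x1), (x2,x3) and
# (x4,x5).  This is the last (8-bit) of the three field-swap passes run
# on the current column group before it is stored back.
#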
+vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 
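#
# A new group of eight rows has just been loaded, from offsets 192, 448,
# ..., 1984 (256 bytes apart).  The mask0 / vpsllq $32 / vpsrlq $32 /
# mask1 pattern below for (x3,x7) repeats what was just done for
# (x0,x4), (x1,x5) and (x2,x6): the high 32 bits of each 64-bit lane of
# the first row are exchanged with the low 32 bits of the same lane in
# its partner row four registers (1024 bytes) further on, the coarsest
# of the swap stages in this routine.
#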
+vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 
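#
# x0..x7 are being loaded with the rows at offsets 224, 480, ..., 2016
# (again 256 bytes apart), the last column group before the mask
# constants are reloaded.  As for the previous groups, the code below
# swaps 32-, 16- and 8-bit fields between rows 4, 2 and 1 registers
# apart and stores the result back in place; this looks like the first
# three butterfly stages of a lane-wise 64x64 bit-matrix transpose, with
# the remaining 4-, 2- and 1-bit stages following further down once the
# MASK2_*/MASK1_*/MASK0_* constants are loaded.
#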
+vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand 
%ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: 
vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: 
vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# 
qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand 
%ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 
& mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 
1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand 
v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | 
v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand 
v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ 
input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 
1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 
& mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = 
x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & 
mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 
1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 
+# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor 
x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE460896F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece460896f/avx/update_asm.S b/crypto_kem/mceliece460896f/avx/update_asm.S new file mode 100644 index 00000000..b819b3a9 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_update_asm +.global PQCLEAN_MCELIECE460896F_AVX_update_asm +_PQCLEAN_MCELIECE460896F_AVX_update_asm: +PQCLEAN_MCELIECE460896F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx 
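[Editor's note, not part of the patch] The long vpand/vpsllq/vpsrlq/vpor runs above implement a bit-matrix transpose by repeatedly exchanging complementary bit groups between pairs of 256-bit registers, with per-64-bit-lane shifts of 4, 2 and 1. A minimal scalar C sketch of one such exchange stage, assuming the two masks are related by m_hi = m_lo << s (the concrete mask constants live in the consts setup, which is not shown in this excerpt); the AVX2 code applies the same step to four 64-bit lanes at once:

    #include <stdint.h>

    /* One butterfly stage of the bit transpose spelled out by the qhasm
     * comments above: keep a's low group, pull b's low group up into a,
     * and symmetrically push a's high group down into b. */
    static void transpose_stage(uint64_t *a, uint64_t *b, uint64_t m_lo, int s) {
        uint64_t m_hi = m_lo << s;         /* assumed relation between the mask pair */
        uint64_t v00 = *a & m_lo;          /* vpand           */
        uint64_t v10 = (*b & m_lo) << s;   /* vpand + vpsllq  */
        uint64_t v01 = (*a & m_hi) >> s;   /* vpand + vpsrlq  */
        uint64_t v11 = *b & m_hi;          /* vpand           */
        *a = v00 | v10;                    /* vpor            */
        *b = v01 | v11;                    /* vpor            */
    }

For a plain 64-bit bit transpose the stages would typically use m_lo = 0x0f0f0f0f0f0f0f0f with s = 4, 0x3333333333333333 with s = 2, and 0x5555555555555555 with s = 1; those values are an assumption here, not taken from this excerpt.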
+ +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_load16(const unsigned char *in) { + return 
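[Editor's note, not part of the patch] In the PQCLEAN_MCELIECE460896F_AVX_update_asm routine above, the qhasm notation "s0 = (s1 s0) >> 1" denotes an x86 double-word shift (shrd): s0 is shifted right by one and its top bit is filled from the low bit of s1. A scalar C sketch of one such step under the assumption of 64-bit words (names are illustrative):

    #include <stdint.h>

    /* Shift the pair (s1:s0) right by one bit, shifting the low bit of s2
     * in at the top, then advance s2 -- the effect of the repeated
     * shrd/shrd/shr chains in the assembly above. */
    static void shift_in_bit(uint64_t *s0, uint64_t *s1, uint64_t *s2) {
        *s0 = (*s0 >> 1) | (*s1 << 63);  /* shrd $1, s1, s0 */
        *s1 = (*s1 >> 1) | (*s2 << 63);  /* shrd $1, s2, s1 */
        *s2 >>= 1;                       /* shr  $1, s2     */
    }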
PQCLEAN_MCELIECE460896F_AVX_vec128_set2x( PQCLEAN_MCELIECE460896F_AVX_load8(in), PQCLEAN_MCELIECE460896F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896F_AVX_store8(out + 0, PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896F_AVX_store8(out + 8, PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece460896f/avx/util.h b/crypto_kem/mceliece460896f/avx/util.h new file mode 100644 index 00000000..54a5efd6 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_UTIL_H +#define PQCLEAN_MCELIECE460896F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE460896F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece460896f/avx/vec128.c b/crypto_kem/mceliece460896f/avx/vec128.c new file mode 100644 index 00000000..4cb57f5e --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void 
PQCLEAN_MCELIECE460896F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece460896f/avx/vec128.h b/crypto_kem/mceliece460896f/avx/vec128.h new file mode 100644 index 00000000..6dd2c6cd --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_VEC128_H +#define PQCLEAN_MCELIECE460896F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE460896F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece460896f/avx/vec128_mul_asm.S b/crypto_kem/mceliece460896f/avx/vec128_mul_asm.S new file mode 100644 index 00000000..ad187dd3 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + 
+# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 
0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# 
qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: 
vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# 
asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 
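+
+# At this point the routine is reassembling the 25 partial products h0..h24
+# from the stack buffer and folding the high limbs back into the low ones
+# (h24 feeds h11, h12, h14 and h15; h23 feeds h10, h11, h13 and h14), i.e.
+# the reduction modulo the GF(2^13) field polynomial x^13 + x^4 + x^3 + x + 1.
+# The 13 reduced 128-bit limbs are afterwards XORed with the remaining
+# buffered products and stored to input_0 + 0 ... input_0 + 192.
+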
+# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] 
= h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE460896F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE460896F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE460896F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE460896F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE460896F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece460896f/avx/vec256_ama_asm.S b/crypto_kem/mceliece460896f/avx/vec256_ama_asm.S new file mode 100644 index 00000000..1b9a7070 --- /dev/null +++ b/crypto_kem/mceliece460896f/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: 
int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & 
mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# 
qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + 
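+# Each stanza in this pass follows the same pattern: one 256-bit limb of the
+# XOR-combined operand (a12 down to a0, each written back to input_0) is
+# ANDed against the thirteen limbs held at input_2, and the partial products
+# are XORed into the accumulators r0..r24; the high accumulators are folded
+# back into the low ones as they retire (e.g. r15 ^= r24 above), giving the
+# usual bitsliced schoolbook multiplication in GF(2^13).  Accumulation sketch
+# in C, for orientation only:
+#
+#     for (i = 0; i < 13; i++)
+#         for (j = 0; j < 13; j++)
+#             r[i + j] ^= a[i] & b[j];   /* one 256-bit vpand + vpxor each */
+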
+# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 
256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# 
qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & 
mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# 
asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 
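+
+# The second multiplication pass repeats the same limb-by-limb AND/XOR
+# accumulation, but sources its operand limbs directly from input_1 (which
+# the first pass has just updated) and omits the initial XOR with input_0
+# and the write-back.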
[qhasm-generated AVX2 assembly: 256-bit vpand/vpxor multiply-accumulate over the bitsliced field-element limbs, followed by a popcnt-based parity reduction of 64-bit limbs; the interleaved "# qhasm:" / "# asm" annotations and many instruction lines are lost to markup stripping]

+#include <stddef.h>
+#include <stdint.h>
+
+#include "aes.h"
+
+
+void PQCLEAN_MCELIECE460896F_CLEAN_aes256ctr(
+    uint8_t *out,
+    size_t outlen,
+    const uint8_t nonce[AESCTR_NONCEBYTES],
+    const uint8_t key[AES256_KEYBYTES]
+);
+
+#endif
diff --git a/crypto_kem/mceliece460896f/clean/api.h b/crypto_kem/mceliece460896f/clean/api.h
new file mode 100644
index 00000000..26b75a71
--- /dev/null
+++ b/crypto_kem/mceliece460896f/clean/api.h
@@ -0,0 +1,31 @@
+#ifndef PQCLEAN_MCELIECE460896F_CLEAN_API_H
+#define PQCLEAN_MCELIECE460896F_CLEAN_API_H
+
+#include <stdint.h>
+
+#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_ALGNAME "Classic 
McEliece 460896f" +#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/benes.c b/crypto_kem/mceliece460896f/clean/benes.c new file mode 100644 index 00000000..de09c1d1 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896F_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + 
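/* A sketch of the layer schedule at this point, assuming GFBITS = 13 for this
   parameter set: the Benes network permutes 2^13 = 8192 bits using
   2*GFBITS - 1 = 25 layers of condition bits, 4096 bits (512 bytes) per layer.
   The loops above have consumed 7 + 6 + 5 = 18 layers: the first 7 were applied
   to the bit-transposed halves via layer_ex, the middle 11 directly via
   layer_in. The transposes and the descending loop below apply the remaining
   7 outer layers, mirroring the first 7. When rev != 0, bits_ptr starts at the
   last 512-byte block (offset 12288 = 24 * 512) and steps backwards one block
   per layer, so the same code walks the layers in reverse order and applies
   the inverse permutation. */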
PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE460896F_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE460896F_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE460896F_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE460896F_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE460896F_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece460896f/clean/benes.h b/crypto_kem/mceliece460896f/clean/benes.h new file mode 100644 index 00000000..df4e7245 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_BENES_H +#define PQCLEAN_MCELIECE460896F_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896F_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE460896F_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/bm.c b/crypto_kem/mceliece460896f/clean/bm.c new file mode 100644 index 00000000..0961b695 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE460896F_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE460896F_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece460896f/clean/bm.h b/crypto_kem/mceliece460896f/clean/bm.h new file mode 100644 index 00000000..d5f689a5 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_BM_H +#define PQCLEAN_MCELIECE460896F_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896F_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/controlbits.c b/crypto_kem/mceliece460896f/clean/controlbits.c new file mode 100644 index 00000000..2c077f2c --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/clean/controlbits.h b/crypto_kem/mceliece460896f/clean/controlbits.h new file mode 100644 index 00000000..3fde760f --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896F_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/crypto_hash.h b/crypto_kem/mceliece460896f/clean/crypto_hash.h new file mode 100644 index 00000000..d67fd7d1 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896F_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896f/clean/decrypt.c b/crypto_kem/mceliece460896f/clean/decrypt.c new file mode 100644 index 00000000..9d9a2493 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896F_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE460896F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE460896F_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE460896F_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE460896F_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE460896F_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE460896F_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE460896F_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece460896f/clean/decrypt.h b/crypto_kem/mceliece460896f/clean/decrypt.h new file mode 100644 index 00000000..7084e2d6 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE460896F_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE460896F_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/encrypt.c b/crypto_kem/mceliece460896f/clean/encrypt.c new file mode 100644 index 00000000..6d7c16c7 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + 
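/* A sketch of the fixed-weight sampling used below, with values for this
   parameter set (SYS_N = 4608, SYS_T = 96, GFBITS = 13): gen_e() draws
   2*SYS_T random 16-bit values, masks each down to GFBITS bits, keeps the
   candidates smaller than SYS_N, and restarts if fewer than SYS_T survive or
   if the first SYS_T of them contain a repeated index, so the loop terminates
   with SYS_T distinct bit positions. The scatter into e[] is constant time:
   same_mask(x, y) below returns 0xFF when x == y and 0x00 otherwise
   (x ^ y == 0, so subtracting 1 wraps to all ones, the shift by 31 yields 1,
   and negation restores an all-ones mask); every byte of e[] is written on
   every pass, independent of where the SYS_T set bits land. */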
+static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE460896F_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896f/clean/encrypt.h b/crypto_kem/mceliece460896f/clean/encrypt.h new file mode 100644 index 00000000..b3677941 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE460896F_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896F_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/gf.c b/crypto_kem/mceliece460896f/clean/gf.c new file mode 100644 index 00000000..bb82d8df --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/gf.c @@ -0,0 +1,211 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) 
^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896F_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE460896F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 11] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(prod[i], (gf) 714); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(prod[i], (gf) 5296); + prod[i - SYS_T + 4] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(prod[i], (gf) 728); + prod[i - SYS_T + 0] ^= 
PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/clean/gf.h b/crypto_kem/mceliece460896f/clean/gf.h new file mode 100644 index 00000000..b11c0866 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_GF_H +#define PQCLEAN_MCELIECE460896F_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896F_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE460896F_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE460896F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/operations.c b/crypto_kem/mceliece460896f/clean/operations.c new file mode 100644 index 00000000..7bc1b29a --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896F_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896F_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896F_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896F_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if 
(PQCLEAN_MCELIECE460896F_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896F_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896F_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896F_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896F_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896F_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/clean/params.h b/crypto_kem/mceliece460896f/clean/params.h new file mode 100644 index 00000000..55021006 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE460896F_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/pk_gen.c b/crypto_kem/mceliece460896f/clean/pk_gen.c new file mode 100644 index 00000000..f285e631 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/pk_gen.c @@ -0,0 +1,294 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint8_t mat[][ SYS_N / 8 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 8; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = PQCLEAN_MCELIECE460896F_CLEAN_load8( &mat[ row + i ][ block_idx ] ); + } + + // compute the column indices of pivots by Gaussian elimination. 
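+    // [editor's note] The pivot search below leans on the two branchless
+    // helpers defined above: ctz() returns the index of the lowest set bit
+    // of a non-zero word without a data-dependent branch, and same_mask()
+    // turns an equality test into an all-zeros / all-ones 64-bit mask.
+    // A minimal standalone sketch of the ctz idea, with hypothetical names
+    // (not part of this implementation):
+    //
+    //     #include <stdint.h>
+    //
+    //     // branchless count-trailing-zeros for a non-zero 64-bit input
+    //     static int ctz64_branchless(uint64_t in) {
+    //         int b, m = 0, r = 0;
+    //         for (int i = 0; i < 64; i++) {
+    //             b = (int)(in >> i) & 1;
+    //             m |= b;                  // latches to 1 at the first set bit
+    //             r += (m ^ 1) & (b ^ 1);  // counts the zero positions before it
+    //         }
+    //         return r;                    // e.g. ctz64_branchless(0x8) == 3
+    //     }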
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = PQCLEAN_MCELIECE460896F_CLEAN_load8( &mat[ i + j ][ block_idx ] ); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + PQCLEAN_MCELIECE460896F_CLEAN_store8( &mat[ i + j ][ block_idx ], buf[j] ); + } + } + + return 0; +} + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE460896F_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE460896F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE460896F_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE460896F_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE460896F_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + 
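+                // [editor's note] The two statements that follow reduce the
+                // shifted XOR of the two column bytes to its low bit and then
+                // stretch that bit into an all-zeros or all-ones byte, so row
+                // k is folded into the pivot row without a secret-dependent
+                // branch.  A minimal sketch of the same trick on its own
+                // (hypothetical helper, illustrative only):
+                //
+                //     #include <stdint.h>
+                //
+                //     // 0xFF if bit j of x is set, 0x00 otherwise
+                //     static uint8_t bit_to_mask(uint8_t x, unsigned j) {
+                //         uint8_t m = (uint8_t)((x >> j) & 1);
+                //         return (uint8_t)-m;      // 1 -> 0xFF, 0 -> 0x00
+                //     }
+                //
+                //     // conditional row addition, no branch:
+                //     //   for (c = 0; c < n; c++) dst[c] ^= src[c] & bit_to_mask(sel, j);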
mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/clean/pk_gen.h b/crypto_kem/mceliece460896f/clean/pk_gen.h new file mode 100644 index 00000000..b581baf3 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE460896F_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE460896F_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/root.c b/crypto_kem/mceliece460896f/clean/root.c new file mode 100644 index 00000000..b96f7a29 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE460896F_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE460896F_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE460896F_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE460896F_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece460896f/clean/root.h b/crypto_kem/mceliece460896f/clean/root.h new file mode 100644 index 00000000..8e4dde0c --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE460896F_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE460896F_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE460896F_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/sk_gen.c b/crypto_kem/mceliece460896f/clean/sk_gen.c new file mode 100644 index 00000000..59a5b48e --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896F_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896F_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { 
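+            // [editor's note] PQCLEAN_MCELIECE460896F_CLEAN_gf_iszero()
+            // (gf.c above) returns the 13-bit all-ones value 0x1FFF when its
+            // argument is zero and 0 otherwise, so the statements below add
+            // column k into column j only while the pivot mat[j][j] is still
+            // zero; this is a constant-time pivot fix-up.  The mask on its
+            // own, as a sketch with illustrative variable names:
+            //
+            //     uint16_t t = a;                           // a in [0, 0x1FFF]
+            //     t = (uint16_t)((uint32_t)(t - 1) >> 19);  // 0x1FFF if a == 0, else 0
+            //     pivot ^= candidate & t;                   // conditional add, no branch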
+ mask = PQCLEAN_MCELIECE460896F_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896F_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896F_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896F_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/clean/sk_gen.h b/crypto_kem/mceliece460896f/clean/sk_gen.h new file mode 100644 index 00000000..19a69515 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE460896F_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896F_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896F_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/synd.c b/crypto_kem/mceliece460896f/clean/synd.c new file mode 100644 index 00000000..f14cdfa6 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE460896F_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE460896F_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE460896F_CLEAN_gf_inv(PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE460896F_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE460896F_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece460896f/clean/synd.h b/crypto_kem/mceliece460896f/clean/synd.h new file mode 100644 index 00000000..56e17fff --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_SYND_H +#define PQCLEAN_MCELIECE460896F_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE460896F_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/transpose.c b/crypto_kem/mceliece460896f/clean/transpose.c new file mode 100644 index 00000000..2db1e1f9 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix 
transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece460896f/clean/transpose.h b/crypto_kem/mceliece460896f/clean/transpose.h new file mode 100644 index 00000000..97440d94 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE460896F_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/clean/util.c b/crypto_kem/mceliece460896f/clean/util.c new file mode 100644 index 00000000..9692d86e --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE460896F_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896F_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896F_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE460896F_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896F_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE460896F_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece460896f/clean/util.h b/crypto_kem/mceliece460896f/clean/util.h new file mode 100644 index 00000000..7066a2d2 --- /dev/null +++ b/crypto_kem/mceliece460896f/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE460896F_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE460896F_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t 
PQCLEAN_MCELIECE460896F_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE460896F_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE460896F_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE460896F_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE460896F_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/LICENSE b/crypto_kem/mceliece460896f/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece460896f/sse/Makefile b/crypto_kem/mceliece460896f/sse/Makefile new file mode 100644 index 00000000..20882a53 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece460896f_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece460896f/sse/aes256ctr.c b/crypto_kem/mceliece460896f/sse/aes256ctr.c new file mode 100644 index 00000000..d8c0abb1 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE460896F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece460896f/sse/aes256ctr.h b/crypto_kem/mceliece460896f/sse/aes256ctr.h new file mode 100644 index 00000000..90645953 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE460896F_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t 
key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896f/sse/api.h b/crypto_kem/mceliece460896f/sse/api.h new file mode 100644 index 00000000..4e38342e --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_API_H +#define PQCLEAN_MCELIECE460896F_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_ALGNAME "Classic McEliece 460896f" +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/benes.c b/crypto_kem/mceliece460896f/sse/benes.c new file mode 100644 index 00000000..da143de4 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896F_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = 
PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 19 ], 
diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896F_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(PQCLEAN_MCELIECE460896F_SSE_load8(ptr), PQCLEAN_MCELIECE460896F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896F_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(PQCLEAN_MCELIECE460896F_SSE_load8(ptr), PQCLEAN_MCELIECE460896F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896F_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + 
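+    // [editor's note] At this point the first group of conditioning layers
+    // (pair strides 1 through 32) has been applied; the transpose below
+    // switches between the two bit orderings the network alternates over so
+    // that the remaining layers again act on register-local pairs.  Every
+    // layer_*() call above performs the classic masked conditional swap; a
+    // scalar sketch with a hypothetical helper, one 64-bit word per node:
+    //
+    //     #include <stdint.h>
+    //
+    //     // swap a and b exactly in the bit positions where c is 1
+    //     static void cond_swap(uint64_t *a, uint64_t *b, uint64_t c) {
+    //         uint64_t d = (*a ^ *b) & c;
+    //         *a ^= d;
+    //         *b ^= d;
+    //     }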
PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece460896f/sse/benes.h b/crypto_kem/mceliece460896f/sse/benes.h new file mode 100644 index 00000000..2ff3d752 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_BENES_H +#define PQCLEAN_MCELIECE460896F_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE460896F_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896F_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/bm.c b/crypto_kem/mceliece460896f/sse/bm.c new file mode 100644 index 00000000..72639e18 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/bm.c @@ -0,0 +1,204 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE460896F_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896F_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE460896F_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE460896F_SSE_vec128_or(PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896F_SSE_vec128_or(PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + 
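+    // [editor's note] get_coefs() converts the bitsliced representation (one
+    // vec128 per bit position) back into 128 individual field elements; the
+    // mask pairs set up below drive interleave(), which is one butterfly step
+    // of that bit-matrix transpose.  A 16-bit scalar sketch of a single step,
+    // with a hypothetical helper name:
+    //
+    //     #include <stdint.h>
+    //
+    //     // stride-8 step: x keeps its low byte and takes y's low byte into
+    //     // its high half; y collects both high bytes
+    //     static void interleave8(uint16_t *x, uint16_t *y) {
+    //         uint16_t nx = (uint16_t)((*x & 0x00FF) | ((*y & 0x00FF) << 8));
+    //         uint16_t ny = (uint16_t)(((*x & 0xFF00) >> 8) | (*y & 0xFF00));
+    //         *x = nx;
+    //         *y = ny;
+    //     }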
vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896F_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0, one << 63); + B[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0, one << 62); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896F_SSE_update_asm(interval, coefs[N]); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(prod, C, (vec128 *) interval); + d = PQCLEAN_MCELIECE460896F_SSE_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_setbits((b >> i) & 
1); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE460896F_SSE_update_asm(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(C[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(C[i], 1); + + out[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/crypto_kem/mceliece460896f/sse/bm.h b/crypto_kem/mceliece460896f/sse/bm.h new file mode 100644 index 00000000..4e365fd3 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_BM_H +#define PQCLEAN_MCELIECE460896F_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE460896F_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/consts.S b/crypto_kem/mceliece460896f/sse/consts.S new file mode 100644 index 00000000..5deeb764 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE460896F_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE460896F_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE460896F_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896F_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896F_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896F_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896F_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896F_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE460896F_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896F_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE460896F_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896F_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896F_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896F_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece460896f/sse/consts.inc b/crypto_kem/mceliece460896f/sse/consts.inc new file mode 100644 index 00000000..c9c5045e --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33333333CCCCCCCC, 
0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00000000FFFF, 
0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 
0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 
0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 
0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 
0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), 
+ PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece460896f/sse/controlbits.c b/crypto_kem/mceliece460896f/sse/controlbits.c new file mode 100644 index 00000000..bdbc5587 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896F_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896F_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896F_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896F_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/sse/controlbits.h b/crypto_kem/mceliece460896f/sse/controlbits.h new file mode 100644 index 00000000..a33065ad --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896F_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896F_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/crypto_hash.h b/crypto_kem/mceliece460896f/sse/crypto_hash.h new file mode 100644 index 00000000..96a38648 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896F_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896f/sse/decrypt.c b/crypto_kem/mceliece460896f/sse/decrypt.c new file mode 100644 index 00000000..448c906f --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/decrypt.c @@ -0,0 +1,204 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE460896F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896F_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896F_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896F_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896F_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896F_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE460896F_SSE_vec128_extract(error[i], 1) );
+    }
+
+    for (i = 0; i < SYS_N / 8; i++) {
+        w1 += _mm_popcnt_u64( e[i] );
+    }
+
+    check = (w0 ^ SYS_T) | (w1 ^ SYS_T);
+    check -= 1;
+    check >>= 15;
+
+    return check;
+}
+
+static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) {
+    int i, j;
+    vec128 diff;
+
+    diff = PQCLEAN_MCELIECE460896F_SSE_vec128_or(PQCLEAN_MCELIECE460896F_SSE_vec128_xor(s0[0][0], s1[0][0]),
+                                                 PQCLEAN_MCELIECE460896F_SSE_vec128_xor(s0[1][0], s1[1][0]));
+
+    for (i = 0; i < 2; i++) {
+        for (j = 1; j < GFBITS; j++) {
+            diff = PQCLEAN_MCELIECE460896F_SSE_vec128_or(diff, PQCLEAN_MCELIECE460896F_SSE_vec128_xor(s0[i][j], s1[i][j]));
+        }
+    }
+
+    return (uint16_t)PQCLEAN_MCELIECE460896F_SSE_vec128_testz(diff);
+}
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* input:  sk, secret key */
+/*         c, ciphertext (syndrome) */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
+int PQCLEAN_MCELIECE460896F_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
+    int i;
+
+    uint16_t check_synd;
+    uint16_t check_weight;
+
+    vec128 inv[ 64 ][ GFBITS ];
+    vec128 scaled[ 64 ][ GFBITS ];
+    vec128 eval[ 64 ][ GFBITS ];
+
+    vec128 error[ 64 ];
+
+    vec128 s_priv[ 2 ][ GFBITS ];
+    vec128 s_priv_cmp[ 2 ][ GFBITS ];
+
+    vec128 locator[ GFBITS ];
+
+    vec128 recv[ 64 ];
+    vec128 allone;
+
+    vec128 bits_int[25][32];
+
+    // Berlekamp decoder
+
+    preprocess(recv, c);
+
+    PQCLEAN_MCELIECE460896F_SSE_load_bits(bits_int, sk + IRR_BYTES);
+    PQCLEAN_MCELIECE460896F_SSE_benes(recv, bits_int, 1);
+
+    scaling(scaled, inv, sk, recv);
+    PQCLEAN_MCELIECE460896F_SSE_fft_tr(s_priv, scaled);
+    PQCLEAN_MCELIECE460896F_SSE_bm(locator, s_priv);
+
+    PQCLEAN_MCELIECE460896F_SSE_fft(eval, locator);
+
+    // reencryption and weight check
+
+    allone = PQCLEAN_MCELIECE460896F_SSE_vec128_setbits(1);
+
+    for (i = 0; i < 64; i++) {
+        error[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_or_reduce(eval[i]);
+        error[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(error[i], allone);
+    }
+
+    scaling_inv(scaled, inv, error);
+    PQCLEAN_MCELIECE460896F_SSE_fft_tr(s_priv_cmp, scaled);
+
+    check_synd = synd_cmp(s_priv, s_priv_cmp);
+
+    //
+
+    PQCLEAN_MCELIECE460896F_SSE_benes(error, bits_int, 0);
+
+    postprocess(e, error);
+
+    check_weight = weight_check(e, error);
+
+    return 1 - (check_synd & check_weight);
+}
+
diff --git a/crypto_kem/mceliece460896f/sse/decrypt.h b/crypto_kem/mceliece460896f/sse/decrypt.h
new file mode 100644
index 00000000..97063297
--- /dev/null
+++ b/crypto_kem/mceliece460896f/sse/decrypt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_MCELIECE460896F_SSE_DECRYPT_H
+#define PQCLEAN_MCELIECE460896F_SSE_DECRYPT_H
+/*
+  This file is for Niederreiter decryption
+*/
+
+int PQCLEAN_MCELIECE460896F_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896f/sse/encrypt.c b/crypto_kem/mceliece460896f/sse/encrypt.c
new file mode 100644
index 00000000..8f863c2b
--- /dev/null
+++ b/crypto_kem/mceliece460896f/sse/encrypt.c
@@ -0,0 +1,100 @@
+/*
+  This file is for Niederreiter encryption
+*/
+
+#include "encrypt.h"
+
+#include "params.h"
+#include "randombytes.h"
+#include "util.h"
+
+#include
+#include
+#include
+#include
+
+#include "gf.h"
+
+/* input: public key pk, error vector e */
+/* output: syndrome s */
+extern void PQCLEAN_MCELIECE460896F_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e);
+
+/* output: e, an error vector of weight t */
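+/* gen_e (below) uses rejection sampling: it draws SYS_T*2 candidate 16-bit   */
+/* indices, masks each down to GFBITS bits, keeps the ones below SYS_N, and   */
+/* starts over unless at least SYS_T survive with the first SYS_T pairwise    */
+/* distinct. The chosen indices are then scattered into the bit vector e      */
+/* with branch-free masks, so the expansion step itself is constant-time.     */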
+static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896F_SSE_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896F_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE460896F_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896f/sse/encrypt.h b/crypto_kem/mceliece460896f/sse/encrypt.h new file mode 100644 index 00000000..d7b95668 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE460896F_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896F_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/fft.c b/crypto_kem/mceliece460896f/sse/fft.c new file mode 100644 index 00000000..baa9072f --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/fft.c @@ -0,0 +1,231 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) 
{ + v1 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE460896F_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = 
PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = 
buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896F_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896f/sse/fft.h b/crypto_kem/mceliece460896f/sse/fft.h new file mode 100644 index 00000000..6ffe7a87 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_FFT_H +#define PQCLEAN_MCELIECE460896F_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE460896F_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/fft_tr.c b/crypto_kem/mceliece460896f/sse/fft_tr.c new file mode 100644 index 00000000..018261b3 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/fft_tr.c @@ -0,0 +1,354 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[0][i], t); + + t = 
PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE460896F_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; 
k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE460896F_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = 
PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = 
PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +/* justifying the length of the output */ +static void postprocess(vec128 out[][GFBITS]) { + int i; + uint64_t v[2]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(out[1][i], 0); + v[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(out[1][i], 1); + + v[1] = 0; + + out[1][i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896F_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896f/sse/fft_tr.h b/crypto_kem/mceliece460896f/sse/fft_tr.h new file mode 100644 index 00000000..23b4cb38 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE460896F_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE460896F_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/gf.c b/crypto_kem/mceliece460896f/sse/gf.c new file mode 100644 index 00000000..05611346 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/gf.c @@ -0,0 +1,205 @@ +/* + 
this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896F_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896F_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896F_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896F_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896F_SSE_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= 
PQCLEAN_MCELIECE460896F_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896F_SSE_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896F_SSE_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896F_SSE_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896F_SSE_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/sse/gf.h b/crypto_kem/mceliece460896f/sse/gf.h new file mode 100644 index 00000000..5a230bf2 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_GF_H +#define PQCLEAN_MCELIECE460896F_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896F_SSE_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896F_SSE_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_SSE_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896F_SSE_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896F_SSE_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896F_SSE_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/operations.c b/crypto_kem/mceliece460896f/sse/operations.c new file mode 100644 index 00000000..efa4f68d --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896F_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896F_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + 
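+    // Key generation retries until every step succeeds: each pass expands the
+    // current seed with AES-256 in CTR mode into r, takes the Goppa polynomial
+    // coefficients, a candidate field-element permutation, and the string kept
+    // at the start of sk from successive slices of r, and restarts (reseeding
+    // from the last 32 bytes of r) whenever genpoly_gen, perm_check, or pk_gen
+    // reports failure.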
randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896F_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896F_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896F_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896F_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896F_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896F_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896F_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896F_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/sse/params.h b/crypto_kem/mceliece460896f/sse/params.h new file mode 100644 index 00000000..55f1893d --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_PARAMS_H +#define PQCLEAN_MCELIECE460896F_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/pk_gen.c b/crypto_kem/mceliece460896f/sse/pk_gen.c new file mode 100644 index 00000000..9c3ebc12 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/pk_gen.c @@ -0,0 +1,344 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = mat[ row + i ][ block_idx ]; + } + + // compute the column indices of pivots by Gaussian elimination. 
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = mat[ i + j ][ block_idx ]; + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx ] = buf[j]; + } + } + + return 0; +} + + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE460896F_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int tail = (GFBITS * SYS_T) % 64; + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE460896F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896F_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE460896F_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896F_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE460896F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // 
gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + PQCLEAN_MCELIECE460896F_SSE_store_i(pk, mat[i][ NBLOCKS1_I - 1 ] >> tail, (64 - tail) / 8); + pk += (64 - tail) / 8; + + for (j = NBLOCKS1_I; j < NBLOCKS1_H; j++) { + PQCLEAN_MCELIECE460896F_SSE_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/sse/pk_gen.h b/crypto_kem/mceliece460896f/sse/pk_gen.h new file mode 100644 index 00000000..e239e02d --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE460896F_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE460896F_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/sse/scalars_2x.inc b/crypto_kem/mceliece460896f/sse/scalars_2x.inc new file mode 100644 index 00000000..58789a47 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece460896f/sse/scalars_4x.inc b/crypto_kem/mceliece460896f/sse/scalars_4x.inc new file mode 100644 index 00000000..59918715 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 
0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece460896f/sse/sk_gen.c b/crypto_kem/mceliece460896f/sse/sk_gen.c new file mode 100644 index 00000000..7c1e0107 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in 
GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE460896F_SSE_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE460896F_SSE_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // gaussian
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE460896F_SSE_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE460896F_SSE_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE460896F_SSE_gf_mul(mat[ c ][ j ], inv);
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896F_SSE_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/*         0 otherwise */
+int PQCLEAN_MCELIECE460896F_SSE_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE460896F_SSE_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/crypto_kem/mceliece460896f/sse/sk_gen.h b/crypto_kem/mceliece460896f/sse/sk_gen.h
new file mode 100644
index 00000000..03c6c0a8
--- /dev/null
+++ b/crypto_kem/mceliece460896f/sse/sk_gen.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_MCELIECE460896F_SSE_SK_GEN_H
+#define PQCLEAN_MCELIECE460896F_SSE_SK_GEN_H
+/*
+  This file is for secret-key generation
+*/
+
+
+#include "gf.h"
+
+#include <stdint.h>
+
+int PQCLEAN_MCELIECE460896F_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/);
+int PQCLEAN_MCELIECE460896F_SSE_perm_check(const uint32_t * /*p*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece460896f/sse/syndrome_asm.S b/crypto_kem/mceliece460896f/sse/syndrome_asm.S
new file mode 100644
index 00000000..ed72124a
--- /dev/null
+++ b/crypto_kem/mceliece460896f/sse/syndrome_asm.S
@@ -0,0 +1,960 @@
+
+# qhasm: int64 input_0
+
+# qhasm: int64 input_1
+
+# qhasm: int64 input_2
+
+# qhasm: int64 input_3
+
+# qhasm: int64 input_4
+
+# qhasm: int64 input_5
+
+# qhasm: stack64 input_6
+
+# qhasm: stack64 input_7
+
+# qhasm: int64 caller_r11
+
+# qhasm: int64 caller_r12
+
+# qhasm: int64 caller_r13
+
+# qhasm: int64 caller_r14
+
+# qhasm: int64 caller_r15
+
+# qhasm: int64 caller_rbx
+
+# qhasm: int64 caller_rbp
+
+# qhasm: int64 b64
+
+# qhasm: int64 synd
+
+# qhasm: int64 addr
+
+# qhasm: int64 c
+
+# qhasm: int64 c_all
+
+# qhasm: int64 row
+
+# qhasm: int64 p
+
+# qhasm: int64 e
+
+# qhasm: int64 s
+
+# qhasm: reg128 pp
+
+# qhasm: reg128 ee
+
+# qhasm: reg128 ss
+
+# qhasm: int64 buf_ptr
+
+# qhasm: stack128 buf
+
+# qhasm: enter syndrome_asm
+.p2align 5
+.global _PQCLEAN_MCELIECE460896F_SSE_syndrome_asm
+.global PQCLEAN_MCELIECE460896F_SSE_syndrome_asm
+_PQCLEAN_MCELIECE460896F_SSE_syndrome_asm:
+PQCLEAN_MCELIECE460896F_SSE_syndrome_asm:
+mov %rsp,%r11
+and $31,%r11
+add $32,%r11
+sub %r11,%rsp
+
+# 
qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 156 ] +# asm 1: movdqu 156(ee=reg128#2 +# asm 2: movdqu 156(ee=%xmm1 +movdqu 156(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 172 ] +# asm 1: movdqu 172(ee=reg128#3 +# asm 2: movdqu 172(ee=%xmm2 +movdqu 172(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 188 ] +# asm 1: movdqu 188(ee=reg128#3 +# asm 2: movdqu 188(ee=%xmm2 +movdqu 188(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 204 ] +# asm 1: movdqu 204(ee=reg128#3 +# asm 2: movdqu 204(ee=%xmm2 +movdqu 204(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 220 ] +# asm 1: movdqu 220(ee=reg128#3 +# asm 2: movdqu 220(ee=%xmm2 +movdqu 220(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 236 ] +# asm 1: movdqu 236(ee=reg128#3 +# asm 2: movdqu 236(ee=%xmm2 +movdqu 236(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 252 ] +# asm 1: movdqu 252(ee=reg128#3 +# asm 2: movdqu 252(ee=%xmm2 +movdqu 252(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 268 ] +# asm 1: movdqu 268(ee=reg128#3 +# asm 2: movdqu 268(ee=%xmm2 +movdqu 268(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 284 ] +# asm 1: movdqu 284(ee=reg128#3 +# asm 2: movdqu 284(ee=%xmm2 +movdqu 284(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 300 ] +# asm 1: movdqu 300(ee=reg128#3 +# asm 2: movdqu 300(ee=%xmm2 +movdqu 300(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 316 ] +# asm 1: movdqu 316(ee=reg128#3 +# asm 2: movdqu 316(ee=%xmm2 +movdqu 316(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 332 ] +# asm 1: movdqu 332(ee=reg128#3 +# asm 2: movdqu 332(ee=%xmm2 +movdqu 332(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 348 ] +# asm 1: movdqu 348(ee=reg128#3 +# asm 2: movdqu 348(ee=%xmm2 +movdqu 348(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 364 ] +# asm 1: movdqu 364(ee=reg128#3 +# asm 2: movdqu 364(ee=%xmm2 +movdqu 364(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# 
qhasm: ee = mem128[ input_2 + 380 ] +# asm 1: movdqu 380(ee=reg128#3 +# asm 2: movdqu 380(ee=%xmm2 +movdqu 380(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 396 ] +# asm 1: movdqu 396(ee=reg128#3 +# asm 2: movdqu 396(ee=%xmm2 +movdqu 396(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 412 ] +# asm 1: movdqu 412(ee=reg128#3 +# asm 2: movdqu 412(ee=%xmm2 +movdqu 412(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 428 ] +# asm 1: movdqu 428(ee=reg128#3 +# asm 2: movdqu 428(ee=%xmm2 +movdqu 428(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 444 ] +# asm 1: movdqu 444(ee=reg128#3 +# asm 2: movdqu 444(ee=%xmm2 +movdqu 444(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 460 ] +# asm 1: movdqu 460(ee=reg128#3 +# asm 2: movdqu 460(ee=%xmm2 +movdqu 460(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 476 ] +# asm 1: movdqu 476(ee=reg128#3 +# asm 2: movdqu 476(ee=%xmm2 +movdqu 476(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 492 ] +# asm 1: movdqu 492(ee=reg128#3 +# asm 2: movdqu 492(ee=%xmm2 +movdqu 492(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 508 ] +# asm 1: movdqu 508(ee=reg128#3 +# asm 2: movdqu 508(ee=%xmm2 +movdqu 508(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 524 ] +# asm 1: movdqu 524(ee=reg128#3 +# asm 2: movdqu 524(ee=%xmm2 +movdqu 524(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 540 ] +# asm 1: movdqu 540(ee=reg128#3 +# asm 2: movdqu 540(ee=%xmm2 +movdqu 540(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 556 ] +# asm 1: movdqu 556(ee=reg128#3 +# asm 2: movdqu 556(ee=%xmm2 +movdqu 556(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov 
addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 96(ss=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 112(ss=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#2 +# asm 2: movdqu 112(ee=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ 
PQCLEAN_MCELIECE460896F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 
+ +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# 
qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + 
+# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand 
v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor 
x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq 
$32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand 
%xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand 
v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: 
vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 
+vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 
+vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 
| v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: 
vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor 
x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 
] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 
+vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# 
qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 
& mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor 
x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: 
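(Editorial sketch, for orientation only: the generated qhasm routines that meet at this point are the two halves of a bit-matrix transpose. The rounds above swap 32-, 16- and 8-bit blocks between register pairs using vpsllq/vpsrlq, vpslld/vpsrld and vpsllw/vpsrlw together with precomputed MASK*_0/MASK*_1 constants; the rounds that follow reload the remaining mask constants and finish with 4-, 2- and 1-bit swaps via psllq/psrlq. The C sketch below shows the same masked shift-and-swap network on a plain uint64_t[64] matrix. It is an illustration of the technique only, close in spirit to the portable transpose used by the clean/vec implementations; the function name is ours and is not defined by the files in this diff.)

#include <stdint.h>

/*
 * Illustrative sketch only -- not generated qhasm and not part of the .S
 * files in this diff.  Transposes a 64x64 bit matrix (row i, bit j) with
 * six masked swap rounds, the scalar analogue of the vectorized stages
 * above (32/16/8-bit blocks) and below (4/2/1-bit blocks).
 */
static void transpose_64x64_sketch(uint64_t out[64], const uint64_t in[64]) {
    static const uint64_t masks[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL},  /* 1-bit blocks  */
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL},  /* 2-bit blocks  */
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL},  /* 4-bit blocks  */
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL},  /* 8-bit blocks  */
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL},  /* 16-bit blocks */
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL},  /* 32-bit blocks */
    };
    int i, j, d, s;
    uint64_t x, y;

    for (i = 0; i < 64; i++) {
        out[i] = in[i];
    }
    /* d = 5..0 gives block sizes 32, 16, 8, 4, 2, 1, the same order as the
     * assembly stages; rows j and j+s exchange s-bit blocks each round. */
    for (d = 5; d >= 0; d--) {
        s = 1 << d;
        for (i = 0; i < 64; i += 2 * s) {
            for (j = i; j < i + s; j++) {
                x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
                y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
                out[j]     = x;
                out[j + s] = y;
            }
        }
    }
}

The access pattern and branch structure are fixed regardless of the matrix contents, so the routine runs in constant time; the hand-scheduled SSE/AVX versions in this diff preserve that property while processing two or four 64-bit lanes per instruction.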
movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896F_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896F_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 
& mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 
+ +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# 
qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor 
%xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 
+# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# 
asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# 
qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor 
%xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# 
asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: 
vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 
& mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 
<<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand 
v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: 
vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand 
v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: 
vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: 
shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896F_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896F_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896F_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896F_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896F_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] 
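+        /* irr_load unpacks the Goppa polynomial: each coefficient is read as
+           a 2-byte little-endian value, and the loops below re-pack the
+           coefficients in bitsliced form, so that output vector i collects
+           bit i of every coefficient (the first 64 coefficients go into
+           v[0], the rest into v[1]). */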
= PQCLEAN_MCELIECE460896F_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896F_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896F_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE460896F_SSE_vec128_set2x( PQCLEAN_MCELIECE460896F_SSE_load8(in), PQCLEAN_MCELIECE460896F_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896F_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896F_SSE_store8(out + 0, PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896F_SSE_store8(out + 8, PQCLEAN_MCELIECE460896F_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece460896f/sse/util.h b/crypto_kem/mceliece460896f/sse/util.h new file mode 100644 index 00000000..8abac405 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_UTIL_H +#define PQCLEAN_MCELIECE460896F_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896F_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896F_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896F_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896F_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896F_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE460896F_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896F_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896F_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896F_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece460896f/sse/vec128.c b/crypto_kem/mceliece460896f/sse/vec128.c new file mode 100644 index 00000000..3fa07585 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE460896F_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(vec128 a, int s) { + return 
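+    /* _mm_slli_epi64 / _mm_srli_epi64 shift the two 64-bit lanes of the
+       vector independently, hence the _2x suffix: a vec128 carries two
+       separate bitsliced words rather than one 128-bit integer. */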
_mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896F_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896F_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE460896F_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE460896F_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE460896F_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE460896F_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, tmp_1111); + 
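+    /* Fermat inversion: for nonzero a in GF(2^13) the inverse is
+       a^(2^13 - 2) = a^0b1111111111110.  The squarings and multiplications
+       in this chain build exponents with 2, 4, 8 and 12 one-bits (tmp_11,
+       tmp_1111 and the two products that follow), and the final single
+       squaring appends the trailing zero bit. */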
PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE460896F_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE460896F_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece460896f/sse/vec128.h b/crypto_kem/mceliece460896f/sse/vec128.h new file mode 100644 index 00000000..cf020836 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE460896F_SSE_VEC128_H +#define PQCLEAN_MCELIECE460896F_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE460896F_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896F_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896F_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896F_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE460896F_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE460896F_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece460896f/sse/vec128_mul_asm.S b/crypto_kem/mceliece460896f/sse/vec128_mul_asm.S new file mode 100644 index 00000000..0c0afcb3 --- /dev/null +++ b/crypto_kem/mceliece460896f/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# 
qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE460896F_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 
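+# Bitsliced schoolbook multiplication: each of the 13 planes a_i of the first
+# operand is ANDed against every plane of the second operand and XORed into
+# the partial product r_{i+j}, giving r0..r24.  The high planes r13..r24 are
+# folded back as they appear using the GF(2^13) polynomial
+# x^13 + x^4 + x^3 + x + 1; the "r15 ^= r24" style reductions interleaved
+# with the accumulation above implement exactly that folding.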
+# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 
176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 
+movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 
1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt 
c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896F_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece460896f/vec/api.h b/crypto_kem/mceliece460896f/vec/api.h new file mode 100644 index 00000000..e542e78d --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_API_H +#define PQCLEAN_MCELIECE460896F_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_ALGNAME "Classic McEliece 460896f" +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/benes.c b/crypto_kem/mceliece460896f/vec/benes.c new file mode 100644 index 00000000..30f9cdfd --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + 
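+            /* d holds, bit by bit, (x ^ y) & condition_bit, so XORing it
+               into both entries swaps exactly the selected bit positions of
+               the j-th and (j+s)-th words; each Benes layer is therefore
+               applied in constant time, without branching on the secret
+               condition bits. */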
data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896F_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE460896F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece460896f/vec/benes.h b/crypto_kem/mceliece460896f/vec/benes.h new file mode 100644 index 00000000..e7b11fef --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_BENES_H +#define PQCLEAN_MCELIECE460896F_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE460896F_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/bm.c b/crypto_kem/mceliece460896f/vec/bm.c new file mode 100644 index 00000000..fa3a22e9 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/bm.c @@ -0,0 +1,238 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static 
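+/* Constant-time selection helpers: mask_nonzero(a) is 0xFFFF when a != 0 and
+   0 otherwise (e.g. mask_nonzero(5) = 0xFFFF, mask_nonzero(0) = 0), and
+   mask_leq(a, b) is 0xFFFF when a <= b.  The resulting masks drive the
+   branch-free B/C updates in the Berlekamp-Massey loop below. */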
inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp 
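+        /* the xor-folds collapse the 64-bit word so that its parity ends up
+           in bit 0; the outer loop then reassembles one GFBITS-bit field
+           element from the bitsliced column, the same computation that the
+           popcnt-based assembly routine earlier in this patch performs. */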
^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896F_VEC_bm(vec out[][GFBITS], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = one << 63; + B[0][0] = 0; + B[1][0] = one << 62; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + update(interval, coefs[N]); + + PQCLEAN_MCELIECE460896F_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(prod[1], C[1], interval[1]); + + d = vec_reduce(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE460896F_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE460896F_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + out[0][i] = (C[0][i] >> 31) | (C[1][i] << 33); + out[1][i] = C[1][i] >> 31; + } +} + diff --git a/crypto_kem/mceliece460896f/vec/bm.h b/crypto_kem/mceliece460896f/vec/bm.h new file mode 100644 index 00000000..cef81820 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_BM_H +#define PQCLEAN_MCELIECE460896F_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896F_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/consts.inc b/crypto_kem/mceliece460896f/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 
0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 
0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 
0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 
0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 
0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 
0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 
0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 
0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece460896f/vec/controlbits.c b/crypto_kem/mceliece460896f/vec/controlbits.c new file mode 100644 index 00000000..c97970de --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + 
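The helpers above (is_smaller, cswap, minmax) are written branch-free on purpose: merge() applies them in a fixed pattern that depends only on n, and sort() composes those merges recursively, so the instruction and memory-access sequence of the whole sort is independent of the (secret) values being permuted. A minimal, self-contained sketch of that core compare-and-swap idea follows; it is not part of the patch, and ct_is_smaller/ct_cswap are illustrative names only.

#include <stdint.h>
#include <stdio.h>

/* 1 iff a < b; valid as long as both values fit in 31 bits, as they do above */
static uint8_t ct_is_smaller(uint32_t a, uint32_t b) {
    return (uint8_t)((a - b) >> 31);
}

/* swap *x and *y iff swap == 1, using a mask instead of a branch */
static void ct_cswap(uint32_t *x, uint32_t *y, uint8_t swap) {
    uint32_t m = (uint32_t)0 - swap;       /* all-ones when swap == 1, else zero */
    uint32_t d = (*x ^ *y) & m;
    *x ^= d;
    *y ^= d;
}

int main(void) {
    uint32_t a = 7, b = 3;
    ct_cswap(&a, &b, ct_is_smaller(b, a)); /* conditional min/max: a = 3, b = 7 */
    printf("%u %u\n", a, b);
    return 0;
}

The _63b variants above and the sort_63b wrapper that follows are the same construction lifted to uint64_t, with the shift by 63 instead of 31.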
+void PQCLEAN_MCELIECE460896F_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896F_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896F_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE460896F_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/vec/controlbits.h b/crypto_kem/mceliece460896f/vec/controlbits.h new file mode 100644 index 00000000..df33325e --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896F_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896F_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/crypto_hash.h b/crypto_kem/mceliece460896f/vec/crypto_hash.h new file mode 100644 index 00000000..b56017df --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896F_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece460896f/vec/decrypt.c b/crypto_kem/mceliece460896f/vec/decrypt.c new file mode 100644 index 00000000..67d45c40 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE460896F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896F_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE460896F_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE460896F_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE460896F_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec 
out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static int weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE460896F_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896F_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE460896F_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE460896F_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896F_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896F_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE460896F_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE460896F_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE460896F_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE460896F_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = (uint16_t)weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece460896f/vec/decrypt.h b/crypto_kem/mceliece460896f/vec/decrypt.h new file mode 100644 index 00000000..23e971fa --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE460896F_VEC_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE460896F_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/encrypt.c b/crypto_kem/mceliece460896f/vec/encrypt.c new file mode 100644 index 00000000..4a268641 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + 
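    /*
     * The loop below rejection-samples the support of the weight-t error
     * vector: it draws SYS_T*2 random 16-bit values, reduces each to GFBITS
     * bits, and keeps only those smaller than SYS_N, starting over whenever
     * fewer than SYS_T survive or the first SYS_T survivors contain a
     * repeated index.  The accepted indices are then scattered into e_int[]
     * with branch-free masks (mask is all-ones exactly when 64-bit word i
     * holds bit ind32[j]), so writing the error vector uses no
     * secret-dependent branches or store addresses.
     */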
while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896F_VEC_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE460896F_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE460896F_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE460896F_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE460896F_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896F_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece460896f/vec/encrypt.h b/crypto_kem/mceliece460896f/vec/encrypt.h new file mode 100644 index 00000000..bef8e1c0 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE460896F_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896F_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/fft.c b/crypto_kem/mceliece460896f/vec/fft.c new file mode 100644 index 00000000..2469c8e4 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/fft.c @@ -0,0 +1,269 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; 
i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE460896F_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] 
= buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + +// for (i = 0; i < 128; i++) +// for (b = 0; b < GFBITS; b++) +// out[i][b] ^= powers[i][b]; +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896F_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece460896f/vec/fft.h b/crypto_kem/mceliece460896f/vec/fft.h new file mode 100644 index 
00000000..505fb7c0 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_FFT_H +#define PQCLEAN_MCELIECE460896F_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896F_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/fft_tr.c b/crypto_kem/mceliece460896f/vec/fft_tr.c new file mode 100644 index 00000000..d68baf61 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/fft_tr.c @@ -0,0 +1,299 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE460896F_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + 
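                /* transposed butterfly: in[k] has just absorbed in[k + s];
                   the product of that sum with this layer's constant is now
                   folded into in[k + s], the transpose of the
                   multiply-then-add butterfly in fft.c */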
PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + 
pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +/* justifying the length of the output */ +static void postprocess(vec out[4][GFBITS]) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[3][i] = 0; + } +} + +void PQCLEAN_MCELIECE460896F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece460896f/vec/fft_tr.h b/crypto_kem/mceliece460896f/vec/fft_tr.h new file mode 100644 index 00000000..8296a6a6 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE460896F_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE460896F_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/gf.c b/crypto_kem/mceliece460896f/vec/gf.c new file mode 100644 index 00000000..86478f6e --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896F_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896F_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, 
gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896F_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896F_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896F_VEC_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece460896f/vec/gf.h b/crypto_kem/mceliece460896f/vec/gf.h new file mode 100644 index 00000000..89a29ebf --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_GF_H +#define PQCLEAN_MCELIECE460896F_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896F_VEC_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896F_VEC_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_VEC_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896F_VEC_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896F_VEC_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896F_VEC_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/operations.c 
b/crypto_kem/mceliece460896f/vec/operations.c new file mode 100644 index 00000000..b6fa6fb5 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896F_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896F_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896F_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896F_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896F_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896F_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896F_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896F_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896F_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896F_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/vec/params.h b/crypto_kem/mceliece460896f/vec/params.h new file mode 100644 index 00000000..4314bb66 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_PARAMS_H +#define PQCLEAN_MCELIECE460896F_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + 
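+// with GFBITS = 13, SYS_N = 4608, SYS_T = 96, the sizes in this file work out to IRR_BYTES = 192, COND_BYTES = 12800, PK_NROWS = 1248, PK_NCOLS = 3360, PK_ROW_BYTES = 420, SYND_BYTES = 156 and SK_BYTES = 13568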
+#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/pk_gen.c b/crypto_kem/mceliece460896f/vec/pk_gen.c new file mode 100644 index 00000000..7aa02221 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/pk_gen.c @@ -0,0 +1,301 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = ((int)in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 63) / 64) ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = mat[ row + i ][ block_idx ]; + } + + // compute the column indices of pivots by Gaussian elimination. 
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = mat[ i + j ][ block_idx ]; + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx ] = buf[j]; + } + } + + return 0; +} + +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE460896F_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE460896F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE460896F_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE460896F_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896F_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE460896F_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_H; c++) { + 
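+ // mask is all-ones exactly while the pivot bit of the current row is still 0, so row k is folded into the pivot row without branching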
mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + PQCLEAN_MCELIECE460896F_VEC_store_i(pk, mat[i][ NBLOCKS_I - 1 ] >> tail, (64 - tail) / 8); + pk += (64 - tail) / 8; + + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE460896F_VEC_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/vec/pk_gen.h b/crypto_kem/mceliece460896f/vec/pk_gen.h new file mode 100644 index 00000000..eb18ff5d --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE460896F_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE460896F_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/scalars_2x.inc b/crypto_kem/mceliece460896f/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 
0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece460896f/vec/scalars_4x.inc b/crypto_kem/mceliece460896f/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 
0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 
0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece460896f/vec/sk_gen.c b/crypto_kem/mceliece460896f/vec/sk_gen.c new file mode 100644 index 00000000..09fd4de1 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896F_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896F_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896F_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896F_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896F_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896F_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896F_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896F_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece460896f/vec/sk_gen.h b/crypto_kem/mceliece460896f/vec/sk_gen.h new file mode 100644 index 00000000..0fe45e81 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE460896F_VEC_SK_GEN_H +/* + This file is 
for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE460896F_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896F_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/transpose.c b/crypto_kem/mceliece460896f/vec/transpose.c new file mode 100644 index 00000000..e72c6f3e --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece460896f/vec/transpose.h b/crypto_kem/mceliece460896f/vec/transpose.h new file mode 100644 index 00000000..f5f026d0 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE460896F_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE460896F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/util.c b/crypto_kem/mceliece460896f/vec/util.c new file mode 100644 index 00000000..22ab6ab7 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE460896F_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896F_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896F_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896F_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896F_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE460896F_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + 
out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896F_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece460896f/vec/util.h b/crypto_kem/mceliece460896f/vec/util.h new file mode 100644 index 00000000..699ba62b --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/util.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_UTIL_H +#define PQCLEAN_MCELIECE460896F_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE460896F_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE460896F_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE460896F_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE460896F_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE460896F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE460896F_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE460896F_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece460896f/vec/vec.c b/crypto_kem/mceliece460896f/vec/vec.c new file mode 100644 index 00000000..4416206a --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/vec.c @@ -0,0 +1,139 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE460896F_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE460896F_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE460896F_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE460896F_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + + +void PQCLEAN_MCELIECE460896F_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE460896F_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i 
< GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE460896F_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE460896F_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE460896F_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE460896F_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece460896f/vec/vec.h b/crypto_kem/mceliece460896f/vec/vec.h new file mode 100644 index 00000000..2f385ea4 --- /dev/null +++ b/crypto_kem/mceliece460896f/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE460896F_VEC_VEC_H +#define PQCLEAN_MCELIECE460896F_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE460896F_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE460896F_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE460896F_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE460896F_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE460896F_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE460896F_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE460896F_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE460896F_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/META.yml b/crypto_kem/mceliece6688128/META.yml new file mode 100644 index 00000000..1c1b80f5 --- /dev/null +++ b/crypto_kem/mceliece6688128/META.yml @@ -0,0 +1,48 @@ +name: Classic McEliece 6688128 +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1044992 +length-secret-key: 13892 +length-ciphertext: 240 +length-shared-secret: 32 +nistkat-sha256: 2946eb61d1505967d2ba223ff64c9baadbefa18ec6849fcbc068c0348a39f6f8 +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt diff --git a/crypto_kem/mceliece6688128/avx/LICENSE b/crypto_kem/mceliece6688128/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. 
+ +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6688128/avx/Makefile b/crypto_kem/mceliece6688128/avx/Makefile new file mode 100644 index 00000000..79067a89 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6688128_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc powers.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6688128/avx/aes256ctr.c b/crypto_kem/mceliece6688128/avx/aes256ctr.c new file mode 100644 index 00000000..3647ad8c --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6688128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6688128/avx/aes256ctr.h b/crypto_kem/mceliece6688128/avx/aes256ctr.h new file mode 100644 index 00000000..ece3bd7a --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE6688128_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128/avx/api.h b/crypto_kem/mceliece6688128/avx/api.h new file mode 100644 index 00000000..51213f9d --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_API_H +#define 
PQCLEAN_MCELIECE6688128_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/benes.c b/crypto_kem/mceliece6688128/avx/benes.c new file mode 100644 index 00000000..937cfce1 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = 
PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = 
PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128_AVX_load8(ptr), PQCLEAN_MCELIECE6688128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128_AVX_load8(ptr), PQCLEAN_MCELIECE6688128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, 
b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6688128/avx/benes.h b/crypto_kem/mceliece6688128/avx/benes.h new file mode 100644 index 00000000..9836478b --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_BENES_H +#define PQCLEAN_MCELIECE6688128_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/bm.c b/crypto_kem/mceliece6688128/avx/bm.c new file mode 100644 index 00000000..86366224 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/bm.c @@ -0,0 +1,214 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x5555); + mask[0][1] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0, one << 63); + BC[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + PQCLEAN_MCELIECE6688128_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((b >> i) & 1); + } 
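+        // Bitsliced update step: the vec256_mul call below multiplies db by BC so that
+        // the two 128-bit halves of BC_tmp hold d*B and b*C for all GFBITS coordinates
+        // at once; XOR-ing those halves a few lines down realizes the inversion-free
+        // Berlekamp-Massey recurrence C' = b*C + d*B, while vec128_cmov and update_asm
+        // conditionally replace B by the previous C when the discrepancy d is nonzero
+        // and 2*L <= N.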
+ + PQCLEAN_MCELIECE6688128_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE6688128_AVX_update_asm(BC, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(out, prod, BC[0] + 1, 32); +} + diff --git a/crypto_kem/mceliece6688128/avx/bm.h b/crypto_kem/mceliece6688128/avx/bm.h new file mode 100644 index 00000000..fc28b9a9 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_BM_H +#define PQCLEAN_MCELIECE6688128_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/consts.S b/crypto_kem/mceliece6688128/avx/consts.S new file mode 100644 index 00000000..74691b8c --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6688128_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6688128_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6688128_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6688128/avx/consts.inc 
b/crypto_kem/mceliece6688128/avx/consts.inc new file mode 100644 index 00000000..72f800bd --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 
0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 
0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 
0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece6688128/avx/controlbits.c b/crypto_kem/mceliece6688128/avx/controlbits.c new file mode 100644 index 00000000..4151542b --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + 
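+        /* ip starts as the identity permutation; composeinv() below then
+           gives ip[pi[i]] = i for every i, i.e. ip ends up holding the
+           inverse of pi. */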
} + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/avx/controlbits.h b/crypto_kem/mceliece6688128/avx/controlbits.h new file mode 100644 index 00000000..4e225daf --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/crypto_hash.h b/crypto_kem/mceliece6688128/avx/crypto_hash.h new file mode 100644 index 00000000..ff4d88bc --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128/avx/decrypt.c b/crypto_kem/mceliece6688128/avx/decrypt.c new file mode 100644 index 00000000..4bdb61a3 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6688128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE6688128_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec256_or(diff, PQCLEAN_MCELIECE6688128_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6688128_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6688128_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6688128_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6688128_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6688128_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128/avx/decrypt.h b/crypto_kem/mceliece6688128/avx/decrypt.h new file mode 100644 index 00000000..79cf19fa --- /dev/null +++ 
b/crypto_kem/mceliece6688128/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6688128_AVX_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE6688128_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/encrypt.c b/crypto_kem/mceliece6688128/avx/encrypt.c new file mode 100644 index 00000000..4ba2d831 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6688128_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128/avx/encrypt.h b/crypto_kem/mceliece6688128/avx/encrypt.h new file mode 100644 index 00000000..85f14f63 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/fft.c b/crypto_kem/mceliece6688128/avx/fft.c new file mode 100644 index 00000000..821976b6 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */
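+/* In broad terms, each pass of radix_conversions() performs one radix
+   conversion of the Gao-Mateer additive FFT: the masked shift-and-xor
+   steps rewrite the bitsliced polynomial as f(x) = f0(x^2 + x) + x*f1(x^2 + x),
+   the scalar xors at the top of each pass handle the split that crosses
+   the 64-bit lane boundary, and the multiplication by s[j]
+   (scalars_2x.inc) applies the per-level twisting constants. */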
+static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = 
PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void 
PQCLEAN_MCELIECE6688128_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6688128/avx/fft.h b/crypto_kem/mceliece6688128/avx/fft.h new file mode 100644 index 00000000..bde52a78 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_FFT_H +#define PQCLEAN_MCELIECE6688128_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6688128_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/fft_tr.c b/crypto_kem/mceliece6688128/avx/fft_tr.c new file mode 100644 index 00000000..d4387000 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + 
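+                /* the remaining lanes are extracted below; the shift/xor
+                   folds on v[1] and v[3] are (roughly) the transpose of the
+                   lane folds performed in radix_conversions() in fft.c */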
v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + 
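+        /* 64x256 bit-matrix transpose, the same primitive used in fft.c;
+           the long xor chain that follows is (roughly) the transpose of the
+           broadcast network in butterflies(). */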
PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], buf.V[61]); + 
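+        /* the chain continues below: each buf.V[] node is updated from the
+           previously visited node, while the pre.V[0..5] accumulators pick
+           up the per-level contributions. */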
pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], buf.V[26]); + 
pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for 
(b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128/avx/fft_tr.h b/crypto_kem/mceliece6688128/avx/fft_tr.h new file mode 100644 index 00000000..2943d396 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6688128_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/gf.c b/crypto_kem/mceliece6688128/avx/gf.c new file mode 100644 index 00000000..e1d66bcd --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; 
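+    /* each masked product above and below folds two coefficient bits of
+       `in` into the accumulator (bits j and j+7, lined up by t0 ^= t0 << 7),
+       adding m at positions 2*j and 2*(j+7) as needed for in^2 * m; the
+       loop further down reduces the result modulo the field polynomial,
+       using the same >>9/>>10/>>12/>>13 shift pattern as gf_mul. */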
+ x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/avx/gf.h b/crypto_kem/mceliece6688128/avx/gf.h new file mode 100644 index 00000000..f38ce690 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_GF_H +#define PQCLEAN_MCELIECE6688128_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128_AVX_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/int32_sort.c b/crypto_kem/mceliece6688128/avx/int32_sort.c new file mode 100644 index 00000000..0a844783 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define 
int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); 
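+                /* the eight vectors loaded above are merged three bitonic
+                   stages at a time: compares at distance 4*q, then 2*q, then q
+                   (completed by x6/x7 below), so each element is loaded and
+                   stored only once per three stages. */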
+ int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + 
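+        /* xor with all-ones complements a register, which reverses its order
+           under signed comparison (~a == -a - 1); flipping one of the two
+           8-element halves turns the pair into a bitonic sequence for
+           merge16_finish, and flagdown selects the final output direction. */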
mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); 
+ int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + 
for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = 
_mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + 
int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + 
int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6688128_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = 
_mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 
= _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece6688128/avx/int32_sort.h b/crypto_kem/mceliece6688128/avx/int32_sort.h new file mode 100644 index 00000000..049dae16 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6688128_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6688128_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6688128/avx/operations.c b/crypto_kem/mceliece6688128/avx/operations.c new file mode 100644 index 00000000..c9b9587f --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include 
"randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/avx/params.h b/crypto_kem/mceliece6688128/avx/params.h new file mode 100644 index 00000000..cc604978 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6688128_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/pk_gen.c b/crypto_kem/mceliece6688128/avx/pk_gen.c new file mode 
100644 index 00000000..d4eff257 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/pk_gen.c @@ -0,0 +1,286 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE6688128_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + 
perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // Gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // compute the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + 
+ for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + PQCLEAN_MCELIECE6688128_AVX_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/avx/pk_gen.h b/crypto_kem/mceliece6688128/avx/pk_gen.h new file mode 100644 index 00000000..479d007c --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6688128_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6688128_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/powers.inc b/crypto_kem/mceliece6688128/avx/powers.inc new file mode 100644 index 00000000..1c35627a --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 
0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 
0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 
0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 
0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 
0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece6688128/avx/scalars_2x.inc 
b/crypto_kem/mceliece6688128/avx/scalars_2x.inc new file mode 100644 index 00000000..5f405cad --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X000000000000FFFF, 
0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6688128/avx/scalars_4x.inc b/crypto_kem/mceliece6688128/avx/scalars_4x.inc new file mode 100644 index 00000000..ce7ae950 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 
0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece6688128/avx/sk_gen.c b/crypto_kem/mceliece6688128/avx/sk_gen.c new file mode 100644 index 00000000..da99dae5 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } 
+ } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/avx/sk_gen.h b/crypto_kem/mceliece6688128/avx/sk_gen.h new file mode 100644 index 00000000..1e5c04e2 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6688128_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/syndrome_asm.S b/crypto_kem/mceliece6688128/avx/syndrome_asm.S new file mode 100644 index 00000000..2152decf --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/syndrome_asm.S @@ -0,0 +1,810 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6688128_AVX_syndrome_asm +_PQCLEAN_MCELIECE6688128_AVX_syndrome_asm: +PQCLEAN_MCELIECE6688128_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 
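For orientation, syndrome_asm.S evaluates the parity-check product used during encapsulation: the listing walks 1664 rows ("row = 1664" above), ANDs each stored matrix row with the error vector starting at byte offset 208, reduces the result with popcnt to a single parity bit, and finally XORs the leading 208 bytes of e into the output. A minimal scalar model of that computation is sketched below; the names (syndrome_model, parity_of_and, rows, row_bytes) and the exact bit/byte layout are placeholders for illustration only, not identifiers from this patch.

#include <stddef.h>
#include <stdint.h>

/* Parity of the number of set bits in (a AND b) over n bytes. */
static int parity_of_and(const uint8_t *a, const uint8_t *b, size_t n) {
    uint8_t acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc ^= (uint8_t)(a[i] & b[i]);
    }
    acc ^= (uint8_t)(acc >> 4);
    acc ^= (uint8_t)(acc >> 2);
    acc ^= (uint8_t)(acc >> 1);
    return acc & 1;
}

/* s = e_head XOR T*e_tail, with H = (I | T) and one stored row of T
   per syndrome bit. */
static void syndrome_model(uint8_t *s, const uint8_t *rows_of_T,
                           const uint8_t *e, int rows, size_t row_bytes) {
    size_t head_bytes = (size_t)rows / 8;

    for (size_t i = 0; i < head_bytes; i++) {
        s[i] = e[i];                               /* identity block of H */
    }
    for (int r = 0; r < rows; r++) {
        int bit = parity_of_and(rows_of_T + (size_t)r * row_bytes,
                                e + head_bytes, row_bytes);
        s[r / 8] ^= (uint8_t)(bit << (r % 8));     /* T block of H */
    }
}

The AVX routine arrives at the same result by processing 256-bit chunks of each row at a time and folding the popcounts of four 64-bit words into one parity bit.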
+vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and 
p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ 
PQCLEAN_MCELIECE6688128_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 
+vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld 
$16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 
+movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = 
x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 
8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: 
vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 
+# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 
+vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x 
v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 
unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# 
asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# 
asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw 
$8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 
2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# 
qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ 
input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 
2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: 
vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 
+ +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 
4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand 
%xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand 
v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq 
$2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 
& mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 
= v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor 
x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq 
$1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + 
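+# Each interleaving step in this stage is equivalent to the following
+# C-style sketch (variable names as in the qhasm comments; mask0/mask1
+# here are the 4-bit-stride masks loaded at the start of this sweep):
+#
+#     v00 = x0 & mask0;            /* low 4 bits of each byte of x0   */
+#     v10 = (x4 & mask0) << 4;     /* low 4 bits of x4, moved up      */
+#     v01 = (x0 & mask1) >> 4;     /* high 4 bits of x0, moved down   */
+#     v11 = x4 & mask1;            /* high 4 bits of each byte of x4  */
+#     x0  = v00 | v10;
+#     x4  = v01 | v11;
+#
+# The same masked shift/OR butterfly is repeated with 2- and 1-bit
+# strides below, and with 32-, 16- and 8-bit strides in the sweep above;
+# together the stages give the bit-level transpose this routine performs
+# on the block at input_0.
+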
+# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand 
%xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 
+vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 
2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# 
asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 
864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# 
qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 
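The qhasm-generated blocks above all repeat one operation: a masked-shift bit-matrix transpose. Eight rows at a time are combined pairwise, exchanging 4-bit groups between rows i and i+4 (vpand against a pair of complementary masks, psllq/psrlq $4, vpor), then 2-bit groups between rows i and i+2, then single bits between adjacent rows; the 256-bit ymm variant further down applies the same pattern at coarser granularity (vpsllq $32, vpslld $16, vpsllw $8 with the PQCLEAN_*_MASK* tables loaded at the top of each routine). For orientation only, the same swap can be written in portable C for a single 64-bit lane; the function names and mask constants below are assumptions inferred from the qhasm comments, not code taken from this patch.

#include <stdint.h>

/* Exchange bit groups of width s between rows a and b: the low groups of *b
 * move into the high-group slots of *a, and the high groups of *a move into
 * the low-group slots of *b -- the same vpand/psllq/psrlq/vpor pattern as above. */
void swap_bit_groups(uint64_t *a, uint64_t *b, unsigned s, uint64_t lo_mask)
{
    uint64_t hi_mask = ~lo_mask;
    uint64_t na = (*a & lo_mask) | ((*b & lo_mask) << s);
    uint64_t nb = ((*a & hi_mask) >> s) | (*b & hi_mask);
    *a = na;
    *b = nb;
}

/* The three fine-grained stages shown above, over one block of eight rows:
 * 4-bit groups between rows i and i+4, 2-bit groups between rows i and i+2,
 * single bits between adjacent rows.  The mask values are the usual
 * alternating-group masks (an assumption; the actual constants are the
 * PQCLEAN_*_MASK* tables referenced by the assembly). */
void transpose_fine_stages(uint64_t x[8])
{
    int i, j;
    for (i = 0; i < 4; i++) {
        swap_bit_groups(&x[i], &x[i + 4], 4, 0x0f0f0f0f0f0f0f0fULL);
    }
    for (j = 0; j < 8; j += 4) {
        for (i = 0; i < 2; i++) {
            swap_bit_groups(&x[j + i], &x[j + i + 2], 2, 0x3333333333333333ULL);
        }
    }
    for (j = 0; j < 8; j += 2) {
        swap_bit_groups(&x[j], &x[j + 1], 1, 0x5555555555555555ULL);
    }
}

The generated assembly performs this per-lane swap on 128-bit (and, in the later routine, 256-bit) registers and walks it across the whole matrix at the memory offsets visible in the movdqu/vmovupd loads and stores.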
+ +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand 
v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand 
%xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK4_0 ] +# asm 1: vmovapd 
PQCLEAN_MCELIECE6688128_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 
32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 
unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ 
input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: 
vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: 
vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: 
vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 
1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 
+# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 
+# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: 
vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# 
asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# 
asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw 
$8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# 
asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: 
vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 
448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 
2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 
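+
+# Note on the structure of this qhasm-generated pass (as the qhasm
+# annotations above show): each group of eight 256-bit rows, loaded 256
+# bytes apart, is interleaved in three butterfly stages -- 32-bit shifts
+# within 64-bit lanes (vpsllq/vpsrlq $32), then 16-bit shifts within
+# 32-bit lanes (vpslld/vpsrld $16), then 8-bit shifts within 16-bit lanes
+# (vpsllw/vpsrlw $8). Each stage pairs row i with row i+4, i+2 and i+1
+# respectively; the mask0..mask5 constants select the lane halves that
+# stay in place, while the shifted halves of the partner row are merged
+# in with vpand/vpor.
+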
+vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor 
x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor 
x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw 
$8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 
unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor 
%ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 
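+
+# Note on this pass: with the PQCLEAN_MCELIECE6688128_AVX_MASK{2,1,0}_{0,1}
+# constants loaded above, the butterfly network continues at bit
+# granularity inside each 64-bit lane, exchanging bits 4, 2 and finally 1
+# position apart (vpsllq/vpsrlq by $4, $2 and $1). Roughly, one butterfly
+# step on rows j and j+s computes (descriptive names only, not identifiers
+# from this file):
+#   x[j]   = (x[j] & m0) | ((x[j+s] & m0) << s)
+#   x[j+s] = ((x[j] & m1) >> s) | (x[j+s] & m1)
+# where (m0, m1) is the MASK*_0/MASK*_1 pair that belongs to shift s.
+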
+vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# 
asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 
<<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# 
asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: 
vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 
unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: 
vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq 
$2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 
1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq 
$2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 
1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq 
$4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 
+# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 
1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: 
vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 
1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: 
vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: 
vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor 
x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6688128_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6688128/avx/update_asm.S b/crypto_kem/mceliece6688128/avx/update_asm.S new file mode 100644 index 00000000..b51f0940 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_update_asm +.global PQCLEAN_MCELIECE6688128_AVX_update_asm +_PQCLEAN_MCELIECE6688128_AVX_update_asm: +PQCLEAN_MCELIECE6688128_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# 
asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128_AVX_vec128_set2x( PQCLEAN_MCELIECE6688128_AVX_load8(in), PQCLEAN_MCELIECE6688128_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128_AVX_store8(out + 0, PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128_AVX_store8(out + 8, PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6688128/avx/util.h b/crypto_kem/mceliece6688128/avx/util.h new file mode 100644 index 00000000..e138b7fa --- 
/dev/null +++ b/crypto_kem/mceliece6688128/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_UTIL_H +#define PQCLEAN_MCELIECE6688128_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6688128_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6688128_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6688128/avx/vec128.c b/crypto_kem/mceliece6688128/avx/vec128.c new file mode 100644 index 00000000..cece2e8d --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece6688128/avx/vec128.h b/crypto_kem/mceliece6688128/avx/vec128.h new file mode 100644 index 00000000..56d53de5 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_VEC128_H +#define PQCLEAN_MCELIECE6688128_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field 
operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE6688128_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece6688128/avx/vec128_mul_asm.S b/crypto_kem/mceliece6688128/avx/vec128_mul_asm.S new file mode 100644 index 00000000..fd1176c0 --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 
h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: 
vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: 
vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = 
mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: 
vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = 
h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: 
vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6688128_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6688128_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6688128_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6688128_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6688128_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece6688128/avx/vec256_ama_asm.S b/crypto_kem/mceliece6688128/avx/vec256_ama_asm.S new file mode 100644 index 00000000..9a93f3da --- /dev/null +++ b/crypto_kem/mceliece6688128/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: 
reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 
352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + 
+# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 
224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 
160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 
+ 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ 
input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd 
r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor 
r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: 
vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r 
+# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 
160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 
192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 
384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# 
qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + 
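The vpand/vpxor ladder above is qhasm-generated and hard to follow in diff form. As a reading aid only, the C sketch below shows the operation it implements: a schoolbook product of two bitsliced field elements, where coefficient i of an operand lives in one wide register and AND/XOR stand in for multiply and add over GF(2). The names bitsliced_mul_sketch and GFBITS_EX are illustrative and not part of this patch; the generated assembly works on 256-bit ymm words and also performs the final reduction by the field polynomial, which is omitted here.

#include <stdint.h>

#define GFBITS_EX 13  /* illustrative coefficient count; not taken from params.h */

static void bitsliced_mul_sketch(uint64_t h[2 * GFBITS_EX - 1],
                                 const uint64_t a[GFBITS_EX],
                                 const uint64_t b[GFBITS_EX]) {
    int i, j;

    for (i = 0; i < 2 * GFBITS_EX - 1; i++) {
        h[i] = 0;
    }

    /* carryless multiply-accumulate: each vpand above is one a[i] & b[j],
       each vpxor folds that partial product into the running word h[i + j] */
    for (i = 0; i < GFBITS_EX; i++) {
        for (j = 0; j < GFBITS_EX; j++) {
            h[i + j] ^= a[i] & b[j];
        }
    }
}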
+# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 
320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi 
+movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128/clean/api.h b/crypto_kem/mceliece6688128/clean/api.h new file mode 100644 index 00000000..7d0c1bd8 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_API_H +#define PQCLEAN_MCELIECE6688128_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_keypair +( + uint8_t 
*pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/benes.c b/crypto_kem/mceliece6688128/clean/benes.c new file mode 100644 index 00000000..ed227df4 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(r_int_v[1], 
r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6688128_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE6688128_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE6688128_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE6688128_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE6688128_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece6688128/clean/benes.h b/crypto_kem/mceliece6688128/clean/benes.h new file mode 100644 index 00000000..86509a6a --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_BENES_H +#define PQCLEAN_MCELIECE6688128_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE6688128_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/bm.c b/crypto_kem/mceliece6688128/clean/bm.c new file mode 100644 index 00000000..adae6976 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE6688128_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE6688128_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece6688128/clean/bm.h b/crypto_kem/mceliece6688128/clean/bm.h new file mode 100644 index 00000000..185ad304 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_BM_H +#define PQCLEAN_MCELIECE6688128_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/controlbits.c b/crypto_kem/mceliece6688128/clean/controlbits.c new file mode 100644 index 00000000..7cfbe4de --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/clean/controlbits.h b/crypto_kem/mceliece6688128/clean/controlbits.h new file mode 100644 index 00000000..2ff7f919 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/crypto_hash.h b/crypto_kem/mceliece6688128/clean/crypto_hash.h new file mode 100644 index 00000000..d337b46c --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128/clean/decrypt.c b/crypto_kem/mceliece6688128/clean/decrypt.c new file mode 100644 index 00000000..fb70223e --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6688128_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE6688128_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE6688128_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE6688128_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE6688128_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE6688128_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE6688128_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece6688128/clean/decrypt.h b/crypto_kem/mceliece6688128/clean/decrypt.h new file mode 100644 index 00000000..3ac70a7d --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE6688128_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/encrypt.c b/crypto_kem/mceliece6688128/clean/encrypt.c new file mode 100644 index 00000000..a4be56d6 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + 
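+/* same_mask(x, y) below is a constant-time equality test: x ^ y is zero only when x == y, so the subtract/shift/negate sequence yields 0xFF for equal inputs and 0x00 otherwise; gen_e() uses it as e[i] |= val[j] & same_mask(i, ind[j] >> 3) to place error bits without branching on secret indices. */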
+static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE6688128_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128/clean/encrypt.h b/crypto_kem/mceliece6688128/clean/encrypt.h new file mode 100644 index 00000000..4f37dd8f --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/gf.c b/crypto_kem/mceliece6688128/clean/gf.c new file mode 100644 index 00000000..ab215a3a --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/gf.c @@ -0,0 +1,210 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) 
^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE6688128_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(prod[i], (gf) 7682); + prod[i - SYS_T + 3] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(prod[i], (gf) 2159); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < SYS_T; 
i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/clean/gf.h b/crypto_kem/mceliece6688128/clean/gf.h new file mode 100644 index 00000000..a9398f89 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_GF_H +#define PQCLEAN_MCELIECE6688128_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6688128_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE6688128_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE6688128_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/operations.c b/crypto_kem/mceliece6688128/clean/operations.c new file mode 100644 index 00000000..bc0d5046 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); 
i++) { + perm[i] = PQCLEAN_MCELIECE6688128_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/clean/params.h b/crypto_kem/mceliece6688128/clean/params.h new file mode 100644 index 00000000..77c1164e --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE6688128_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/pk_gen.c b/crypto_kem/mceliece6688128/clean/pk_gen.c new file mode 100644 index 00000000..f63778bc --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/pk_gen.c @@ -0,0 +1,144 @@ +/* + This file is for public-key generation +*/ + +#include + +#include "benes.h" +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE6688128_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + uint8_t mask; + uint8_t b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6688128_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE6688128_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE6688128_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE6688128_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 
1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/clean/pk_gen.h b/crypto_kem/mceliece6688128/clean/pk_gen.h new file mode 100644 index 00000000..b26bb215 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE6688128_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6688128_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/root.c b/crypto_kem/mceliece6688128/clean/root.c new file mode 100644 index 00000000..80648bb7 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE6688128_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE6688128_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE6688128_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE6688128_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece6688128/clean/root.h b/crypto_kem/mceliece6688128/clean/root.h new file mode 100644 index 00000000..ad5854c6 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE6688128_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE6688128_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE6688128_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/sk_gen.c b/crypto_kem/mceliece6688128/clean/sk_gen.c new file mode 100644 index 00000000..186c470e --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128_CLEAN_GF_mul(mat[j], mat[j - 1], f); + 
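+ /* mat[j] = mat[j-1] * f, so mat[j] holds f^j in GF((2^m)^t); the Gaussian elimination below then solves for the linear relation among f^0, ..., f^SYS_T, whose coefficients give the minimal polynomial of f */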
} + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/clean/sk_gen.h b/crypto_kem/mceliece6688128/clean/sk_gen.h new file mode 100644 index 00000000..14f9b3e4 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE6688128_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/synd.c b/crypto_kem/mceliece6688128/clean/synd.c new file mode 100644 index 00000000..0fbff85d --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE6688128_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE6688128_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE6688128_CLEAN_gf_inv(PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE6688128_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE6688128_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece6688128/clean/synd.h b/crypto_kem/mceliece6688128/clean/synd.h new file mode 100644 index 00000000..3e06d7cc --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_SYND_H +#define PQCLEAN_MCELIECE6688128_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/transpose.c b/crypto_kem/mceliece6688128/clean/transpose.c new file mode 100644 index 00000000..185d9056 --- /dev/null +++ 
b/crypto_kem/mceliece6688128/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece6688128/clean/transpose.h b/crypto_kem/mceliece6688128/clean/transpose.h new file mode 100644 index 00000000..496c43df --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE6688128_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/clean/util.c b/crypto_kem/mceliece6688128/clean/util.c new file mode 100644 index 00000000..4046d92a --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE6688128_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE6688128_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE6688128_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece6688128/clean/util.h b/crypto_kem/mceliece6688128/clean/util.h new file mode 100644 index 00000000..2dcbad48 --- /dev/null +++ b/crypto_kem/mceliece6688128/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE6688128_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void 
PQCLEAN_MCELIECE6688128_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE6688128_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE6688128_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE6688128_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE6688128_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE6688128_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/LICENSE b/crypto_kem/mceliece6688128/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6688128/sse/Makefile b/crypto_kem/mceliece6688128/sse/Makefile new file mode 100644 index 00000000..4c56ecea --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6688128_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc powers.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6688128/sse/aes256ctr.c b/crypto_kem/mceliece6688128/sse/aes256ctr.c new file mode 100644 index 00000000..a3cf1fdd --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6688128_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6688128/sse/aes256ctr.h b/crypto_kem/mceliece6688128/sse/aes256ctr.h new file mode 100644 index 00000000..eb8ef292 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE6688128_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128_SSE_aes256ctr( + uint8_t *out, + 
size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128/sse/api.h b/crypto_kem/mceliece6688128/sse/api.h new file mode 100644 index 00000000..a1e593a2 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_API_H +#define PQCLEAN_MCELIECE6688128_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/benes.c b/crypto_kem/mceliece6688128/sse/benes.c new file mode 100644 index 00000000..1e13733d --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], diff); 
+ bs[ s + 19 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128_SSE_load8(ptr), PQCLEAN_MCELIECE6688128_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128_SSE_load8(ptr), PQCLEAN_MCELIECE6688128_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += 
inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6688128/sse/benes.h b/crypto_kem/mceliece6688128/sse/benes.h new file mode 100644 index 00000000..4561e5ac --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_BENES_H +#define PQCLEAN_MCELIECE6688128_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/bm.c b/crypto_kem/mceliece6688128/sse/bm.c new file mode 100644 index 00000000..f5cf6e93 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/bm.c @@ -0,0 +1,208 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE6688128_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE6688128_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE6688128_SSE_vec128_or(PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128_SSE_vec128_or(PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; 
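+ /* buf holds the 13 bit-planes of the input padded with three zero planes; the interleave() passes below amount, in effect, to a 16x16 bit-matrix transpose, turning the bitsliced form (one bit of every coefficient per vector) into one full coefficient per 16-bit lane, which the final loop extracts into out[] */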
+ + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(); + B[0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(prod, C, (vec128 *) interval); + PQCLEAN_MCELIECE6688128_SSE_update_asm(interval, coefs[N]); + d = PQCLEAN_MCELIECE6688128_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128_SSE_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE6688128_SSE_update_asm(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece6688128/sse/bm.h b/crypto_kem/mceliece6688128/sse/bm.h new file mode 100644 index 00000000..738bca92 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_BM_H +#define PQCLEAN_MCELIECE6688128_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/consts.S b/crypto_kem/mceliece6688128/sse/consts.S new file mode 100644 index 00000000..d5b5d790 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE6688128_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE6688128_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE6688128_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6688128_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6688128/sse/consts.inc b/crypto_kem/mceliece6688128/sse/consts.inc new file mode 100644 index 00000000..88c74297 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 
0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 
0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996699669966996, 
0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), 
+ PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 
0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), 
+ PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece6688128/sse/controlbits.c b/crypto_kem/mceliece6688128/sse/controlbits.c new file mode 100644 index 00000000..46bdc1ba --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/sse/controlbits.h b/crypto_kem/mceliece6688128/sse/controlbits.h new file mode 100644 index 00000000..a0b72616 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/crypto_hash.h b/crypto_kem/mceliece6688128/sse/crypto_hash.h new file mode 100644 index 00000000..b5846c60 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128/sse/decrypt.c b/crypto_kem/mceliece6688128/sse/decrypt.c new file mode 100644 index 00000000..1a28a407 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/decrypt.c @@ -0,0 +1,204 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6688128_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE6688128_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_or(PQCLEAN_MCELIECE6688128_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE6688128_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE6688128_SSE_vec128_or(diff, PQCLEAN_MCELIECE6688128_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6688128_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 64 ][ GFBITS ]; + vec128 scaled[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + + vec128 error[ 64 ]; + + vec128 s_priv[ 2 ][ GFBITS ]; + vec128 s_priv_cmp[ 2 ][ GFBITS ]; + + vec128 locator[ GFBITS ]; + + vec128 recv[ 64 ]; + vec128 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6688128_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128_SSE_benes(recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6688128_SSE_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128_SSE_vec128_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6688128_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6688128_SSE_benes(error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128/sse/decrypt.h b/crypto_kem/mceliece6688128/sse/decrypt.h new file mode 100644 index 00000000..c7a27be5 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE6688128_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/encrypt.c b/crypto_kem/mceliece6688128/sse/encrypt.c new file mode 100644 index 00000000..261322a9 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/encrypt.c @@ -0,0 +1,105 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ 
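/*
   A minimal sketch of the branch-free equality mask that gen_e (below)
   inlines when scattering the t error bits: assuming a ^ b < 2^63, which
   holds for the small word indices used here, eq_mask64(a, b) is all-ones
   when a == b and zero otherwise, so
       e_int[i] |= val[j] & eq_mask64(i, ind32[j] >> 6);
   sets each chosen bit without a secret-dependent branch or table lookup.
   The helper name eq_mask64 is illustrative only.
*/
static inline uint64_t eq_mask64(uint64_t a, uint64_t b) {
    uint64_t m = a ^ b;       /* 0 iff a == b */
    m -= 1;                   /* wraps to all-ones iff a == b */
    m >>= 63;                 /* 1 iff a == b (needs a ^ b < 2^63) */
    return (uint64_t)0 - m;   /* stretch that bit to a full 0 / all-ones mask */
}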
+static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128/sse/encrypt.h b/crypto_kem/mceliece6688128/sse/encrypt.h new file mode 100644 index 00000000..0c7be6e2 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/fft.c b/crypto_kem/mceliece6688128/sse/fft.c new file mode 100644 index 00000000..f53124ac --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/fft.c @@ -0,0 +1,243 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include 
"scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec128 powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE6688128_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6688128/sse/fft.h b/crypto_kem/mceliece6688128/sse/fft.h new file mode 100644 index 00000000..894d7ebb --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_FFT_H +#define PQCLEAN_MCELIECE6688128_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE6688128_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/fft_tr.c b/crypto_kem/mceliece6688128/sse/fft_tr.c new file mode 100644 index 00000000..c572efb1 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/fft_tr.c @@ -0,0 +1,338 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + 
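/* Scaling step of the transposed radix conversion: the two bitsliced
   halves in[0] (just above) and in[1] (next line) are each multiplied by
   their own per-level constants s[j][0] and s[j][1] from scalars_4x.inc
   before the masked shift-and-xor passes that follow. */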
PQCLEAN_MCELIECE6688128_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE6688128_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; 
b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6688128_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = 
PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +void PQCLEAN_MCELIECE6688128_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128/sse/fft_tr.h b/crypto_kem/mceliece6688128/sse/fft_tr.h new file mode 100644 index 00000000..3af229d7 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE6688128_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/gf.c b/crypto_kem/mceliece6688128/sse/gf.c new file mode 100644 index 00000000..51394a40 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field 
arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 
1/den */ +gf PQCLEAN_MCELIECE6688128_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128_SSE_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128_SSE_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128_SSE_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128_SSE_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/sse/gf.h b/crypto_kem/mceliece6688128/sse/gf.h new file mode 100644 index 00000000..3da367f6 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_GF_H +#define PQCLEAN_MCELIECE6688128_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128_SSE_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/operations.c b/crypto_kem/mceliece6688128/sse/operations.c new file mode 100644 index 00000000..61adb1df --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } 
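/* Editorial note (illustrative, not part of the upstream PQClean patch): the copy
   loops above are a constant-time select driven by the byte mask m derived from
   ret_decrypt | ret_confirm.  Worked example: on success both codes are 0, so
   m = 0, m -= 1 wraps to 0xFFFF, and m >>= 8 leaves 0xFF; then
   (~m & sk[i]) | (m & e[i]) keeps the recovered error vector e.  On any failure the
   OR lies in 1..255, m - 1 stays below 0x100, the shift leaves 0x00, and the random
   string stored at the start of sk is hashed instead (implicit rejection).  Either
   way the same instructions execute, so no secret-dependent branch is taken. */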
+ + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/sse/params.h b/crypto_kem/mceliece6688128/sse/params.h new file mode 100644 index 00000000..806c4fc3 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_PARAMS_H +#define PQCLEAN_MCELIECE6688128_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/pk_gen.c b/crypto_kem/mceliece6688128/sse/pk_gen.c new file mode 100644 index 00000000..f309af05 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/pk_gen.c @@ -0,0 +1,258 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for 
(r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 127) / 128) +int PQCLEAN_MCELIECE6688128_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE6688128_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 
0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + PQCLEAN_MCELIECE6688128_SSE_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128_SSE_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/sse/pk_gen.h b/crypto_kem/mceliece6688128/sse/pk_gen.h new file mode 100644 index 00000000..fd1e50bc --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE6688128_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6688128_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/powers.inc b/crypto_kem/mceliece6688128/sse/powers.inc new file mode 100644 index 00000000..f92af938 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/powers.inc @@ -0,0 +1,960 @@ +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece6688128/sse/scalars_2x.inc b/crypto_kem/mceliece6688128/sse/scalars_2x.inc new file mode 100644 index 00000000..56f9cbd1 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6688128/sse/scalars_4x.inc b/crypto_kem/mceliece6688128/sse/scalars_4x.inc new file mode 100644 index 00000000..72153ec1 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + 
PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000000000000FFFF, 
0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece6688128/sse/sk_gen.c b/crypto_kem/mceliece6688128/sse/sk_gen.c new file mode 100644 index 00000000..5f6e099d --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128_SSE_genpoly_gen(gf *out, gf *f) { 
+ int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/sse/sk_gen.h b/crypto_kem/mceliece6688128/sse/sk_gen.h new file mode 100644 index 00000000..fdfb7eec --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE6688128_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6688128_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/sse/syndrome_asm.S b/crypto_kem/mceliece6688128/sse/syndrome_asm.S new file mode 100644 index 00000000..66b16611 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/syndrome_asm.S @@ -0,0 +1,1260 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_SSE_syndrome_asm +.global PQCLEAN_MCELIECE6688128_SSE_syndrome_asm +_PQCLEAN_MCELIECE6688128_SSE_syndrome_asm: +PQCLEAN_MCELIECE6688128_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# 
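The perm_check routine added above reduces to a sort followed by a scan for adjacent duplicates: the 32-bit entries are widened to 64 bits, sorted with the constant-time sort_63b, and rejected if any two neighbours are equal. A minimal scalar sketch of the same check is given below; it uses qsort from the C standard library in place of sort_63b, and the PERM_LEN constant is a hypothetical stand-in for 1 << GFBITS, so it is illustrative only and not part of the patch.

#include <stdint.h>
#include <stdlib.h>

#define PERM_LEN (1 << 13)  /* hypothetical stand-in for 1 << GFBITS (GFBITS = 13 for this parameter set) */

/* qsort comparator for 64-bit keys */
static int cmp_u64(const void *a, const void *b) {
    uint64_t x = *(const uint64_t *)a;
    uint64_t y = *(const uint64_t *)b;
    return (x > y) - (x < y);
}

/* -1 if any value repeats in p, 0 otherwise -- same contract as perm_check,
   but using a variable-time library sort instead of the constant-time sort_63b */
static int perm_check_sketch(const uint32_t *p) {
    static uint64_t list[PERM_LEN];
    int i;

    for (i = 0; i < PERM_LEN; i++) {
        list[i] = p[i];                      /* widen to 64 bits, as the routine above does */
    }
    qsort(list, PERM_LEN, sizeof list[0], cmp_u64);
    for (i = 1; i < PERM_LEN; i++) {
        if (list[i - 1] == list[i]) {
            return -1;                       /* duplicate found: not a permutation */
        }
    }
    return 0;
}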
asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#2 +# asm 2: movdqu 208(ee=%xmm1 +movdqu 208(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(ee=reg128#3 +# asm 2: movdqu 432(ee=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# 
asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(ee=reg128#3 +# asm 2: movdqu 448(ee=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(ee=reg128#3 +# asm 2: movdqu 464(ee=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(ee=reg128#3 +# asm 2: movdqu 480(ee=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(ee=reg128#3 +# asm 2: movdqu 496(ee=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(ee=reg128#3 +# asm 2: movdqu 512(ee=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(ee=reg128#3 +# asm 2: movdqu 528(ee=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(ee=reg128#3 +# asm 2: movdqu 544(ee=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(ee=reg128#3 +# asm 2: movdqu 560(ee=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(ee=reg128#3 +# asm 2: movdqu 576(ee=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 592 ] +# asm 1: movdqu 592(ee=reg128#3 +# asm 2: movdqu 592(ee=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(ee=reg128#3 +# asm 2: movdqu 608(ee=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 416(pp=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(ee=reg128#3 +# asm 2: movdqu 624(ee=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 432(pp=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(ee=reg128#3 +# asm 2: movdqu 640(ee=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 448(pp=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(ee=reg128#3 +# asm 2: movdqu 656(ee=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 464(pp=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 672 ] +# asm 1: movdqu 672(ee=reg128#3 +# asm 2: movdqu 672(ee=%xmm2 +movdqu 672(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 480(pp=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 688 ] +# asm 1: 
movdqu 688(ee=reg128#3 +# asm 2: movdqu 688(ee=%xmm2 +movdqu 688(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 496(pp=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 704 ] +# asm 1: movdqu 704(ee=reg128#3 +# asm 2: movdqu 704(ee=%xmm2 +movdqu 704(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 512(pp=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 720 ] +# asm 1: movdqu 720(ee=reg128#3 +# asm 2: movdqu 720(ee=%xmm2 +movdqu 720(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 528(pp=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 736 ] +# asm 1: movdqu 736(ee=reg128#3 +# asm 2: movdqu 736(ee=%xmm2 +movdqu 736(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 544(pp=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 752 ] +# asm 1: movdqu 752(ee=reg128#3 +# asm 2: movdqu 752(ee=%xmm2 +movdqu 752(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 560(pp=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 768 ] +# asm 1: movdqu 768(ee=reg128#3 +# asm 2: movdqu 768(ee=%xmm2 +movdqu 768(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 576(pp=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 784 ] +# asm 1: movdqu 784(ee=reg128#3 +# asm 2: movdqu 784(ee=%xmm2 +movdqu 784(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 592(pp=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 800 ] +# asm 1: movdqu 800(ee=reg128#3 +# asm 2: movdqu 800(ee=%xmm2 +movdqu 800(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 
16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 96(ss=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 112(ss=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#2 +# asm 2: movdqu 112(ee=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 128(ss=%xmm0 +movdqu 128(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#2 +# asm 2: movdqu 128(ee=%xmm1 +movdqu 128(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 144(ss=%xmm0 +movdqu 144(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#2 +# asm 2: movdqu 144(ee=%xmm1 +movdqu 144(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 160(ss=%xmm0 +movdqu 160(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#2 +# asm 2: movdqu 160(ee=%xmm1 +movdqu 160(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 176(ss=%xmm0 +movdqu 176(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#2 +# asm 2: movdqu 176(ee=%xmm1 +movdqu 176(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 192(ss=%xmm0 +movdqu 192(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#2 +# asm 2: movdqu 192(ee=%xmm1 +movdqu 192(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= 
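The vectorized syndrome routine above computes s = He for H = [I | T]: each row of the public-key block T is ANDed with the tail of the error vector, the parity of the result gives one syndrome bit, and the head of e is XORed in at the end for the identity block. A non-vectorized reference sketch of that computation follows; PK_NROWS, PK_ROW_BYTES and SYND_BYTES are assumed to be 1664, 628 and 208 for this parameter set, and the helper names are illustrative rather than taken from the patch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PK_NROWS     1664              /* m*t, assumed for mceliece6688128 */
#define PK_ROW_BYTES 628               /* (n - m*t)/8, assumed for this parameter set */
#define SYND_BYTES   (PK_NROWS / 8)    /* 208 bytes */

/* parity of the bitwise AND of two byte strings of equal length */
static int and_parity(const uint8_t *a, const uint8_t *b, size_t len) {
    uint8_t acc = 0;
    size_t i;
    for (i = 0; i < len; i++) {
        acc ^= (uint8_t)(a[i] & b[i]);
    }
    acc ^= (uint8_t)(acc >> 4);
    acc ^= (uint8_t)(acc >> 2);
    acc ^= (uint8_t)(acc >> 1);
    return acc & 1;
}

/* reference syndrome: s = [I | T] * e, with T stored row-wise in pk */
static void syndrome_sketch(uint8_t *s, const uint8_t *pk, const uint8_t *e) {
    int row;
    memcpy(s, e, SYND_BYTES);                          /* identity block: head of e */
    for (row = 0; row < PK_NROWS; row++) {
        const uint8_t *pk_row = pk + (size_t)row * PK_ROW_BYTES;
        int bit = and_parity(pk_row, e + SYND_BYTES, PK_ROW_BYTES);
        s[row / 8] ^= (uint8_t)(bit << (row % 8));     /* T block: parity of row & tail of e */
    }
}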
mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 
2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# 
qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor 
%xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# 
qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 
+# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: 
vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: 
movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor 
%xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + 
+# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | 
v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 
+# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: 
vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor 
%xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand 
%xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 
+ 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 
& mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw 
$8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 
2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand 
%xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK2_1 ] +# asm 1: movdqa 
PQCLEAN_MCELIECE6688128_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# 
asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 
+# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 
160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 
+vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 
unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 
256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# 
asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: 
vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# 
qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + 
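+
+# Note: each mask/shift/vpor group in this routine follows the same
+# transpose-butterfly pattern: v00/v10 take the bits selected by the even
+# mask (mask0, mask2, mask4), v01/v11 take the odd-mask bits (mask1, mask3,
+# mask5), one half is shifted by 4, 2 or 1 bit positions into alignment,
+# and the halves are recombined with vpor. This appears to be the usual
+# masked-shift bit-matrix transpose network; the register numbering in the
+# "asm 1"/"asm 2" annotations is qhasm's generated allocation.
+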
+# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq 
$2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# 
qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand 
v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 
56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 
1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128_SSE_vec128_set2x( PQCLEAN_MCELIECE6688128_SSE_load8(in), PQCLEAN_MCELIECE6688128_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128_SSE_store8(out + 0, PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128_SSE_store8(out + 8, PQCLEAN_MCELIECE6688128_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6688128/sse/util.h b/crypto_kem/mceliece6688128/sse/util.h new file mode 100644 index 00000000..4bdce0c9 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_UTIL_H +#define PQCLEAN_MCELIECE6688128_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6688128_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6688128_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6688128/sse/vec128.c b/crypto_kem/mceliece6688128/sse/vec128.c new file mode 100644 index 00000000..6954af3c --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE6688128_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(uint64_t a0, 
uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6688128_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE6688128_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6688128_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6688128_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(out, out, tmp_1111); // 
^11111111 + + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6688128_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6688128/sse/vec128.h b/crypto_kem/mceliece6688128/sse/vec128.h new file mode 100644 index 00000000..a567b1ee --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE6688128_SSE_VEC128_H +#define PQCLEAN_MCELIECE6688128_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE6688128_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE6688128_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE6688128_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece6688128/sse/vec128_mul_asm.S b/crypto_kem/mceliece6688128/sse/vec128_mul_asm.S new file mode 100644 index 00000000..68140db4 --- /dev/null +++ b/crypto_kem/mceliece6688128/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 
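+
+# Note: the reg128 variables r0..r24 declared here appear to hold the 25
+# bitsliced coefficient vectors of the carry-less product of two
+# 13-coefficient (degree-12) bitsliced field elements; the high
+# coefficients r13..r24 are later folded back into r0..r12 (e.g.
+# "r15 ^= r24" below), which is consistent with reduction modulo the
+# degree-13 field polynomial z^13 + z^4 + z^3 + z + 1 used for GF(2^13)
+# in Classic McEliece.
+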
+ +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE6688128_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 
2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa 
r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 
16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor 
r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 
72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128/vec/api.h b/crypto_kem/mceliece6688128/vec/api.h new file mode 100644 index 00000000..adf2ccdf --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_API_H +#define PQCLEAN_MCELIECE6688128_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/benes.c b/crypto_kem/mceliece6688128/vec/benes.c new file mode 100644 index 00000000..69bde6c7 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i 
= 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece6688128/vec/benes.h b/crypto_kem/mceliece6688128/vec/benes.h new file mode 100644 index 00000000..943fa463 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_BENES_H +#define PQCLEAN_MCELIECE6688128_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE6688128_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/bm.c b/crypto_kem/mceliece6688128/vec/bm.c new file mode 100644 index 00000000..540889da --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/bm.c @@ -0,0 +1,245 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + 
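+    /* widen to 32 bits; the sign bit of b - a below is turned into an all-ones mask exactly when a <= b */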
uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128_VEC_bm(vec out[][ 
GFBITS ], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + vec v[GFBITS]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = 0; + B[0][0] = 0; + B[1][0] = one << 63; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[1], C[1], interval[1]); + update(interval, coefs[N]); + d = vec_reduce(prod); + + t = PQCLEAN_MCELIECE6688128_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE6688128_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE6688128_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + v[i] = PQCLEAN_MCELIECE6688128_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out[0], C[0], v); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out[1], C[1], v); +} + diff --git a/crypto_kem/mceliece6688128/vec/bm.h b/crypto_kem/mceliece6688128/vec/bm.h new file mode 100644 index 00000000..f19ce901 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_BM_H +#define PQCLEAN_MCELIECE6688128_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/consts.inc b/crypto_kem/mceliece6688128/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 
0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 
0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 
0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 
0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 
0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 
0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 
0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 
0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece6688128/vec/controlbits.c b/crypto_kem/mceliece6688128/vec/controlbits.c new file mode 100644 index 00000000..039cf2e8 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + 
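+    /* sort each half recursively, then merge the two sorted halves */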
sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE6688128_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/vec/controlbits.h b/crypto_kem/mceliece6688128/vec/controlbits.h new file mode 100644 index 00000000..6cea6671 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/crypto_hash.h b/crypto_kem/mceliece6688128/vec/crypto_hash.h new file mode 100644 index 00000000..3d284110 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128/vec/decrypt.c b/crypto_kem/mceliece6688128/vec/decrypt.c new file mode 100644 index 00000000..c675d433 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6688128_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6688128_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE6688128_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6688128_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec 
out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static int weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6688128_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6688128_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6688128_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE6688128_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6688128_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6688128_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = (uint16_t)weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128/vec/decrypt.h b/crypto_kem/mceliece6688128/vec/decrypt.h new file mode 100644 index 00000000..8b7a202c --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE6688128_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/encrypt.c b/crypto_kem/mceliece6688128/vec/encrypt.c new file mode 100644 index 00000000..9309b952 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/encrypt.c @@ -0,0 +1,142 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + 
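+    // Rejection sampling of the weight-t error vector: each pass of the loop below draws
+    // 2*SYS_T candidate indices, reduces them to GFBITS bits, keeps those below SYS_N, and
+    // retries unless at least SYS_T survive with the first SYS_T pairwise distinct; the
+    // selected positions are then written into e_int using constant-time masking.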
while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE6688128_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE6688128_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE6688128_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE6688128_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128/vec/encrypt.h b/crypto_kem/mceliece6688128/vec/encrypt.h new file mode 100644 index 00000000..44aa40ca --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/fft.c b/crypto_kem/mceliece6688128/vec/fft.c new file mode 100644 index 00000000..294f5801 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/fft.c @@ -0,0 +1,274 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < 
GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 128 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = 
buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 128; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128_VEC_fft(vec out[][GFBITS], vec 
in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} diff --git a/crypto_kem/mceliece6688128/vec/fft.h b/crypto_kem/mceliece6688128/vec/fft.h new file mode 100644 index 00000000..0b6e15ab --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_FFT_H +#define PQCLEAN_MCELIECE6688128_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/fft_tr.c b/crypto_kem/mceliece6688128/vec/fft_tr.c new file mode 100644 index 00000000..47d03ef8 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/fft_tr.c @@ -0,0 +1,289 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE6688128_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; 
i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; 
+ pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +void PQCLEAN_MCELIECE6688128_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128/vec/fft_tr.h b/crypto_kem/mceliece6688128/vec/fft_tr.h new file mode 100644 index 00000000..670981a9 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE6688128_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/gf.c b/crypto_kem/mceliece6688128/vec/gf.c new file mode 100644 index 00000000..2297ae41 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t 
M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128_VEC_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128_VEC_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128_VEC_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128_VEC_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128/vec/gf.h b/crypto_kem/mceliece6688128/vec/gf.h new file mode 100644 index 00000000..30fa4572 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_GF_H +#define PQCLEAN_MCELIECE6688128_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf 
PQCLEAN_MCELIECE6688128_VEC_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6688128_VEC_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128_VEC_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6688128_VEC_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6688128_VEC_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6688128_VEC_gf_inv(gf in); + +void PQCLEAN_MCELIECE6688128_VEC_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/operations.c b/crypto_kem/mceliece6688128/vec/operations.c new file mode 100644 index 00000000..af2e6102 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + 
} + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/vec/params.h b/crypto_kem/mceliece6688128/vec/params.h new file mode 100644 index 00000000..b8d8b8af --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_PARAMS_H +#define PQCLEAN_MCELIECE6688128_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/pk_gen.c b/crypto_kem/mceliece6688128/vec/pk_gen.c new file mode 100644 index 00000000..7d5c0971 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/pk_gen.c @@ -0,0 +1,236 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +int PQCLEAN_MCELIECE6688128_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + const int block_idx = NBLOCKS_I; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE6688128_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + 
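+    // Remaining GFBITS-row blocks for the leading column blocks: each pass multiplies
+    // prod[j] by the bitsliced support elements in consts[j] and stores the bit planes
+    // as the next block of rows, building up the bitsliced parity-check matrix.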
for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE6688128_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS_H - 1; k++) { + PQCLEAN_MCELIECE6688128_VEC_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128_VEC_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/vec/pk_gen.h b/crypto_kem/mceliece6688128/vec/pk_gen.h new file mode 100644 index 00000000..ee930895 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE6688128_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6688128_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/powers.inc b/crypto_kem/mceliece6688128/vec/powers.inc new file mode 100644 index 00000000..a9bd6179 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/powers.inc @@ -0,0 +1,1920 @@ +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 
0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 
0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 
0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 
0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 
0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 
0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 
0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 
0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 
0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 
0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 
0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +} diff --git a/crypto_kem/mceliece6688128/vec/scalars_2x.inc b/crypto_kem/mceliece6688128/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 
0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece6688128/vec/scalars_4x.inc b/crypto_kem/mceliece6688128/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 
0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 
0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece6688128/vec/sk_gen.c b/crypto_kem/mceliece6688128/vec/sk_gen.c new file mode 100644 index 00000000..67448ae2 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128/vec/sk_gen.h b/crypto_kem/mceliece6688128/vec/sk_gen.h new 
file mode 100644 index 00000000..95bdeb71 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE6688128_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/transpose.c b/crypto_kem/mceliece6688128/vec/transpose.c new file mode 100644 index 00000000..1afd25c1 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece6688128/vec/transpose.h b/crypto_kem/mceliece6688128/vec/transpose.h new file mode 100644 index 00000000..3c080f1b --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE6688128_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/util.c b/crypto_kem/mceliece6688128/vec/util.c new file mode 100644 index 00000000..d1104b92 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6688128_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + 
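The 64x64 routine in transpose.c above is the standard mask-and-shift bit-matrix transpose: at level d it swaps the two off-diagonal 2^d-by-2^d sub-blocks inside every 2^(d+1)-by-2^(d+1) block, so after levels 5 down to 0, bit i of output word j equals bit j of input word i. A minimal self-test of that property (a hypothetical harness, not part of this patch) could look like:

/* Hypothetical self-test harness (not part of this patch): checks the
 * transpose property "bit i of out[j] == bit j of in[i]" on an arbitrary
 * input pattern. */
#include <assert.h>
#include <stdint.h>

#include "transpose.h"

int main(void) {
    uint64_t in[64], out[64];
    int i, j;

    for (i = 0; i < 64; i++) {
        /* arbitrary test pattern */
        in[i] = 0x0123456789ABCDEFULL ^ (0x1111111111111111ULL * (uint64_t)i);
    }

    PQCLEAN_MCELIECE6688128_VEC_transpose_64x64(out, in);

    for (i = 0; i < 64; i++) {
        for (j = 0; j < 64; j++) {
            assert(((out[j] >> i) & 1) == ((in[i] >> j) & 1));
        }
    }

    return 0;
}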
out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE6688128_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece6688128/vec/util.h b/crypto_kem/mceliece6688128/vec/util.h new file mode 100644 index 00000000..5e93e09f --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/util.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_UTIL_H +#define PQCLEAN_MCELIECE6688128_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6688128_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE6688128_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE6688128_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE6688128_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE6688128_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE6688128_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE6688128_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece6688128/vec/vec.c b/crypto_kem/mceliece6688128/vec/vec.c new file mode 100644 index 00000000..085f8789 --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/vec.c @@ -0,0 +1,139 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE6688128_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE6688128_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE6688128_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE6688128_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + + +void PQCLEAN_MCELIECE6688128_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6688128_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + 
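PQCLEAN_MCELIECE6688128_VEC_vec_mul in vec.c above is a bitsliced carry-less multiplication in GF(2^13): the double loop builds the 2*GFBITS-1 coefficients of the polynomial product, and the second loop folds every coefficient of degree >= GFBITS back down through the taps +4, +3, +1, +0, i.e. the reduction x^13 = x^4 + x^3 + x + 1. The same arithmetic for a single field element, written as a hypothetical, non-constant-time sketch for illustration only:

/* Hypothetical illustration (not part of this patch): the multiplication that
 * vec_mul performs for 64 bitsliced elements at once, done here for one
 * element, without the constant-time guarantees of the bitsliced code. */
#include <stdint.h>

#define GFBITS 13
#define GFMASK ((1 << GFBITS) - 1)

uint16_t gf_mul_scalar(uint16_t f, uint16_t g) {
    uint32_t buf = 0;
    int i;

    /* carry-less schoolbook product, degree <= 2*GFBITS - 2 */
    for (i = 0; i < GFBITS; i++) {
        if ((f >> i) & 1) {
            buf ^= (uint32_t)g << i;
        }
    }

    /* reduce with x^13 = x^4 + x^3 + x + 1 (the "+4, +3, +1, +0" taps above) */
    for (i = 2 * GFBITS - 2; i >= GFBITS; i--) {
        if ((buf >> i) & 1) {
            buf ^= (uint32_t)1 << (i - GFBITS + 4);
            buf ^= (uint32_t)1 << (i - GFBITS + 3);
            buf ^= (uint32_t)1 << (i - GFBITS + 1);
            buf ^= (uint32_t)1 << (i - GFBITS + 0);
        }
    }

    /* bits >= GFBITS are dropped, just as vec_mul never copies buf[GFBITS..] out */
    return (uint16_t)(buf & GFMASK);
}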
result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6688128_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6688128_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6688128_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6688128/vec/vec.h b/crypto_kem/mceliece6688128/vec/vec.h new file mode 100644 index 00000000..4c16b84c --- /dev/null +++ b/crypto_kem/mceliece6688128/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE6688128_VEC_VEC_H +#define PQCLEAN_MCELIECE6688128_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE6688128_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE6688128_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE6688128_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE6688128_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE6688128_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE6688128_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE6688128_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE6688128_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/META.yml b/crypto_kem/mceliece6688128f/META.yml new file mode 100644 index 00000000..510d17e9 --- /dev/null +++ b/crypto_kem/mceliece6688128f/META.yml @@ -0,0 +1,50 @@ +name: Classic McEliece 6688128 +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1044992 +length-secret-key: 13892 +length-ciphertext: 240 +length-shared-secret: 32 +nistkat-sha256: a8270440cacaa34509c9cf24bd5c79cc58db774adcd65b2f98d46dcf8749f632 +principal-submitters: + - Daniel J. 
Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - bmi1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi1 + - popcnt diff --git a/crypto_kem/mceliece6688128f/avx/LICENSE b/crypto_kem/mceliece6688128f/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6688128f/avx/Makefile b/crypto_kem/mceliece6688128f/avx/Makefile new file mode 100644 index 00000000..5431f4a2 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6688128f_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc powers.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6688128f/avx/aes256ctr.c b/crypto_kem/mceliece6688128f/avx/aes256ctr.c new file mode 100644 index 00000000..7f4c0216 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6688128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t 
key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6688128f/avx/aes256ctr.h b/crypto_kem/mceliece6688128f/avx/aes256ctr.h new file mode 100644 index 00000000..a5fd4dc4 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE6688128F_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128f/avx/api.h b/crypto_kem/mceliece6688128f/avx/api.h new file mode 100644 index 00000000..40dcac7a --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_API_H +#define PQCLEAN_MCELIECE6688128F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/benes.c b/crypto_kem/mceliece6688128f/avx/benes.c new file mode 100644 index 00000000..50f1c535 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = 
PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = 
PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 
32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6688128f/avx/benes.h b/crypto_kem/mceliece6688128f/avx/benes.h new file mode 100644 index 00000000..905e5eac --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_BENES_H +#define PQCLEAN_MCELIECE6688128F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/bm.c b/crypto_kem/mceliece6688128f/avx/bm.c new file mode 100644 index 00000000..c9a95286 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/bm.c @@ -0,0 +1,214 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include <stdint.h> + +extern gf PQCLEAN_MCELIECE6688128F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(out[i][1], m0); + v1 =
PQCLEAN_MCELIECE6688128F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(PQCLEAN_MCELIECE6688128F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + 
uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0, one << 63); + BC[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + PQCLEAN_MCELIECE6688128F_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE6688128F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE6688128F_AVX_update_asm(BC, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(out, prod, BC[0] + 1, 32); +} + diff --git a/crypto_kem/mceliece6688128f/avx/bm.h b/crypto_kem/mceliece6688128f/avx/bm.h new file mode 100644 index 00000000..a4b6d9b2 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_BM_H +#define PQCLEAN_MCELIECE6688128F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/consts.S b/crypto_kem/mceliece6688128f/avx/consts.S new file mode 100644 index 00000000..f9127c38 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6688128F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 
0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6688128F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6688128f/avx/consts.inc b/crypto_kem/mceliece6688128f/avx/consts.inc new file mode 100644 index 00000000..0415c18e --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece6688128f/avx/controlbits.c b/crypto_kem/mceliece6688128f/avx/controlbits.c new file mode 100644 index 00000000..0a015282 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/avx/controlbits.h b/crypto_kem/mceliece6688128f/avx/controlbits.h new file mode 100644 index 00000000..90f8ea6e --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/crypto_hash.h b/crypto_kem/mceliece6688128f/avx/crypto_hash.h new file mode 100644 index 00000000..b8783575 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128f/avx/decrypt.c b/crypto_kem/mceliece6688128f/avx/decrypt.c new file mode 100644 index 00000000..5e29bfc0 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6688128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(diff, PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6688128F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6688128F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6688128F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6688128F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6688128F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128f/avx/decrypt.h b/crypto_kem/mceliece6688128f/avx/decrypt.h new file mode 100644 index 00000000..229dce4b 
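Editorial note on the decryption code above, not part of the upstream diff: weight_check turns "both Hamming weights equal SYS_T" into a 0/1 flag without branching on secret data (XOR against the target, subtract 1 so that only an exact match wraps to all ones, keep the top bit), synd_cmp similarly reduces the syndrome comparison to a 0/1 flag via vec256_testz, and the final return 1 - (check_synd & check_weight) maps "syndrome matches and weight is correct" to 0 (success) and everything else to 1. A standalone sketch of the equality-flag idiom follows; eq_flag16 is a name introduced here only for illustration, and it assumes x ^ target stays below 0x8000, which holds for the weights and SYS_T used above.

#include <stdint.h>

/* 1 if x == target, 0 otherwise, with no data-dependent branch.
   Precondition: x ^ target < 0x8000, as in weight_check above. */
static uint16_t eq_flag16(uint16_t x, uint16_t target) {
    uint16_t d = x ^ target;     /* zero iff equal              */
    d -= 1;                      /* wraps to 0xFFFF iff d was 0 */
    return (uint16_t)(d >> 15);  /* top bit: 1 iff x == target  */
}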
--- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6688128F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/encrypt.c b/crypto_kem/mceliece6688128f/avx/encrypt.c new file mode 100644 index 00000000..3859648e --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6688128F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128f/avx/encrypt.h b/crypto_kem/mceliece6688128f/avx/encrypt.h new file mode 100644 index 00000000..ccc75600 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/fft.c b/crypto_kem/mceliece6688128f/avx/fft.c new file mode 100644 index 00000000..344f9dd0 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of 
applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + 
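/* Editorial note, not part of the upstream sources (assuming vec128_unpack_low/_high
   pair the 64-bit halves as usual): at this point word 0 of tmp holds the upper 64
   bitsliced coefficients multiplied by beta[i + 0] and word 1 the same coefficients
   multiplied by beta[i + 1]; the unpack_low above and the unpack_high on the next
   line copy each word into both halves of its own pre.v[] entry for the later XOR
   stages. */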
pre.v[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, 
polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6688128f/avx/fft.h b/crypto_kem/mceliece6688128f/avx/fft.h new file mode 100644 index 00000000..5ecdf44a --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_FFT_H +#define PQCLEAN_MCELIECE6688128F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6688128F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/fft_tr.c b/crypto_kem/mceliece6688128f/avx/fft_tr.c new file mode 100644 index 00000000..504d6a01 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 
* k + 0] ][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] 
= PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((beta[0] >> j) 
& 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128f/avx/fft_tr.h b/crypto_kem/mceliece6688128f/avx/fft_tr.h new file mode 100644 index 00000000..210cc82b --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6688128F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/gf.c b/crypto_kem/mceliece6688128f/avx/gf.c new file mode 100644 index 00000000..bba70c12 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf 
gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128F_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/avx/gf.h b/crypto_kem/mceliece6688128f/avx/gf.h new file mode 100644 index 00000000..e92a9c54 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_GF_H +#define PQCLEAN_MCELIECE6688128F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif 
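Editorial note on the field arithmetic above, not part of the upstream diff: the shift amounts 9, 10, 12 and 13 used by the reduction steps in PQCLEAN_MCELIECE6688128F_AVX_gf_mul (and in gf_sq2, gf_sqmul and gf_sq2mul) correspond to reducing modulo x^13 + x^4 + x^3 + x + 1, i.e. 0x201B, with GFMASK = 0x1FFF for GFBITS = 13. The sketch below, under the hypothetical name gf13_mul_ref, is only a slow, branching reference for cross-checking the branchless multiplication in tests; it is not constant time and must not replace it.

#include <stdint.h>

/* Schoolbook multiplication in GF(2^13) reduced modulo x^13 + x^4 + x^3 + x + 1.
   Reference/test use only: it branches on its inputs. */
static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;
    int i;

    for (i = 0; i < 13; i++) {       /* carry-less (polynomial) product */
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }
    for (i = 24; i >= 13; i--) {     /* fold x^i via x^13 = x^4 + x^3 + x + 1 */
        if ((acc >> i) & 1) {
            acc ^= (uint32_t)0x201B << (i - 13);
        }
    }
    return (uint16_t)(acc & 0x1FFF); /* same mask as GFMASK */
}

Comparing gf13_mul_ref against PQCLEAN_MCELIECE6688128F_AVX_gf_mul over all 2^13 x 2^13 input pairs is a cheap exhaustive check of the reduction constants.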
+ diff --git a/crypto_kem/mceliece6688128f/avx/int32_sort.c b/crypto_kem/mceliece6688128f/avx/int32_sort.c new file mode 100644 index 00000000..f01b002e --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = 
int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = 
_mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * 
q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = 
_mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = 
_mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * 
q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void 
PQCLEAN_MCELIECE6688128F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6688128F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 
0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = 
_mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece6688128f/avx/int32_sort.h b/crypto_kem/mceliece6688128f/avx/int32_sort.h new file mode 100644 index 00000000..fa408e94 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6688128F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6688128F_AVX_int32_sort(int32_t *x, size_t n); + 
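/*
 * Reviewer note, not part of the upstream header: this routine sorts
 * x[0], ..., x[n-1] in ascending order, in place.  The AVX2 implementation in
 * int32_sort.c is built from odd-even / bitonic merging networks, so the
 * sequence of compare-exchange operations (and the memory access pattern)
 * depends only on n, never on the values being sorted, which is what makes it
 * suitable for constant-time use.  Minimal usage sketch:
 *
 *     int32_t a[16] = { ... };
 *     PQCLEAN_MCELIECE6688128F_AVX_int32_sort(a, 16);   // a is now ascending
 */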
+#endif diff --git a/crypto_kem/mceliece6688128f/avx/operations.c b/crypto_kem/mceliece6688128f/avx/operations.c new file mode 100644 index 00000000..be8c6bcf --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/avx/params.h b/crypto_kem/mceliece6688128f/avx/params.h new file mode 100644 index 00000000..9cbe21f4 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6688128F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + 
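/*
 * Reviewer note, not part of the upstream header: with m = GFBITS = 13,
 * n = SYS_N = 6688 and t = SYS_T = 128, the derived macros below evaluate to
 *
 *   PK_NROWS     = 13 * 128           = 1664
 *   PK_NCOLS     = 6688 - 1664        = 5024
 *   PK_ROW_BYTES = (5024 + 7) / 8     = 628    (public key: 1664 * 628 = 1044992 bytes)
 *   SYND_BYTES   = (1664 + 7) / 8     = 208    (ciphertext: 208 + 32 = 240 bytes)
 *   IRR_BYTES    = 2 * 128            = 256
 *   COND_BYTES   = (1 << 9) * 25      = 12800
 *   SK_BYTES     = 836 + 256 + 12800  = 13892
 *
 * consistent with the advertised mceliece6688128 KEM sizes (pk 1044992, sk 13892, ct 240).
 */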
+#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/pk_gen.c b/crypto_kem/mceliece6688128f/avx/pk_gen.c new file mode 100644 index 00000000..93432c0f --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/pk_gen.c @@ -0,0 +1,360 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ 
((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6688128F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE6688128F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/avx/pk_gen.h b/crypto_kem/mceliece6688128f/avx/pk_gen.h new file mode 100644 index 00000000..923c3a2e --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6688128F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6688128F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/powers.inc b/crypto_kem/mceliece6688128f/avx/powers.inc new file mode 100644 index 00000000..aa25d709 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 
0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece6688128f/avx/scalars_2x.inc b/crypto_kem/mceliece6688128f/avx/scalars_2x.inc new file mode 100644 index 00000000..42d6b9f0 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6688128f/avx/scalars_4x.inc b/crypto_kem/mceliece6688128f/avx/scalars_4x.inc new file mode 100644 index 00000000..573f6090 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece6688128f/avx/sk_gen.c b/crypto_kem/mceliece6688128f/avx/sk_gen.c new file mode 100644 index 00000000..21937714 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/avx/sk_gen.h b/crypto_kem/mceliece6688128f/avx/sk_gen.h new file mode 100644 index 00000000..c87cf34b --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6688128F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/syndrome_asm.S b/crypto_kem/mceliece6688128f/avx/syndrome_asm.S new file mode 100644 index 00000000..1b9d6d17 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/syndrome_asm.S @@ -0,0 +1,810 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 
input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm +_PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm: +PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand 
pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor 
b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 ] +# asm 1: movdqa 
PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 
1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand 
v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 
+vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld 
$16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 
+ +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 
+# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 
+# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 
1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld 
$16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = 
x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# 
qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 
2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: 
vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 
+vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x 
v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + 
+# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor 
x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# 
asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand 
%xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa 
PQCLEAN_MCELIECE6688128F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: 
psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 
2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 
208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor 
%xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand 
%xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# 
asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + 
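(Editorial note on the pattern above, not part of the generated listing: each level of this qhasm code splits a pair of registers into v00/v01/v10/v11 with a pair of complementary masks, shifts two of the pieces left/right by 4, 2 or 1 bits (and by 32/16/8 in the earlier levels), and recombines them with OR — the classic masked shift/OR bit-matrix transpose. A minimal portable C sketch of the same idea, written for a 64x64 bit matrix held in a plain uint64_t[64] array, is given below; the name transpose_64x64_ref and the flat-array layout are illustrative assumptions and do not reflect the vectorized register allocation used by this AVX code.)

#include <stdint.h>

/* Sketch: transpose a 64x64 bit matrix, rows as uint64_t words.
 * Levels d = 5..0 swap blocks of width s = 32, 16, 8, 4, 2, 1 between
 * row pairs (j, j+s), mirroring the v00/v10/v01/v11 steps in the qhasm
 * comments above. */
static void transpose_64x64_ref(uint64_t out[64], const uint64_t in[64]) {
    static const uint64_t mask[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL},
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL},
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL},
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL},
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL},
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL},
    };

    for (int i = 0; i < 64; i++) {
        out[i] = in[i];
    }

    for (int d = 5; d >= 0; d--) {
        int s = 1 << d;
        for (int i = 0; i < 64; i += 2 * s) {
            for (int j = i; j < i + s; j++) {
                /* v00 | (v10 << s)  and  (v01 >> s) | v11 */
                uint64_t lo = (out[j] & mask[d][0]) | ((out[j + s] & mask[d][0]) << s);
                uint64_t hi = ((out[j] & mask[d][1]) >> s) | (out[j + s] & mask[d][1]);
                out[j]     = lo;
                out[j + s] = hi;
            }
        }
    }
}

(The assembly performs the same swaps 128 bits at a time with vpand/vpsllq/vpsrlq/vpor, with the six mask constants loaded once from the PQCLEAN_*_MASKn_{0,1} tables, which is why the same five-instruction group repeats for every register pair.)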
+# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 
2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# 
qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 
+vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 
2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: 
movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: 
vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: 
vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# 
qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# 
asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor 
%xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd 
PQCLEAN_MCELIECE6688128F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: 
vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 
+vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq 
$32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# 
asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 
+vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq 
$32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 
2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 
2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 
+vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 
+vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand 
%ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld 
$16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw 
$8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand 
%ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 
+vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 
+vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 
+vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# 
qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 ] +# asm 1: vmovapd 
PQCLEAN_MCELIECE6688128F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = 
v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# 
qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# 
asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 
+# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand 
v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand 
v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: 
vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand 
%ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor 
x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 
+vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor 
x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 
+# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# 
asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# 
asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = 
x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6688128F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6688128f/avx/update_asm.S b/crypto_kem/mceliece6688128f/avx/update_asm.S new file mode 100644 index 00000000..ee626daf --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 
caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_update_asm +.global PQCLEAN_MCELIECE6688128F_AVX_update_asm +_PQCLEAN_MCELIECE6688128F_AVX_update_asm: +PQCLEAN_MCELIECE6688128F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: 
movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x( PQCLEAN_MCELIECE6688128F_AVX_load8(in), PQCLEAN_MCELIECE6688128F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128F_AVX_store8(out + 0, PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128F_AVX_store8(out + 8, PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6688128f/avx/util.h b/crypto_kem/mceliece6688128f/avx/util.h new file mode 100644 index 00000000..e206e40a --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_UTIL_H +#define PQCLEAN_MCELIECE6688128F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6688128F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t 
PQCLEAN_MCELIECE6688128F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6688128f/avx/vec128.c b/crypto_kem/mceliece6688128f/avx/vec128.c new file mode 100644 index 00000000..16533dc3 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece6688128f/avx/vec128.h b/crypto_kem/mceliece6688128f/avx/vec128.h new file mode 100644 index 00000000..afc46d46 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_VEC128_H +#define PQCLEAN_MCELIECE6688128F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
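[Editor's note, not part of the patch: a minimal usage sketch of the extract macro defined just below. It assumes a compiler with SSE4.1 enabled (e.g. -msse4.1 or -mavx2, matching this avx implementation); the names vec128_demo and vec128_extract_demo are hypothetical stand-ins for the vec128 typedef and PQCLEAN_MCELIECE6688128F_AVX_vec128_extract, introduced only so the snippet is self-contained. The point it illustrates is the one stated in the comment above: the lane index must be a compile-time literal, because _mm_extract_epi64 takes an immediate operand.]

#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>          /* SSE4.1: _mm_extract_epi64 */

typedef __m128i vec128_demo;    /* stand-in for the vec128 typedef above */

/* Same shape as the macro below; the index must be a compile-time literal. */
#define vec128_extract_demo(a, i) ((uint64_t) _mm_extract_epi64((a), (i)))

int main(void) {
    /* _mm_set_epi64x(high, low): low lane = 0x...ff00, high lane = 0x...7788 */
    vec128_demo v = _mm_set_epi64x(0x1122334455667788ULL,
                                   0x99aabbccddeeff00ULL);
    uint64_t lo = vec128_extract_demo(v, 0);   /* low 64-bit lane  */
    uint64_t hi = vec128_extract_demo(v, 1);   /* high 64-bit lane */
    printf("lo=%016llx hi=%016llx\n",
           (unsigned long long) lo, (unsigned long long) hi);
    return 0;                   /* calling vec128_extract_demo(v, idx) with a
                                   runtime idx would not compile, which is why
                                   the real header uses a macro, not a function */
}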
+#define PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece6688128f/avx/vec128_mul_asm.S b/crypto_kem/mceliece6688128f/avx/vec128_mul_asm.S new file mode 100644 index 00000000..e97f1d61 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 
h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand 
%ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# 
qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: 
input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor 
h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# 
asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 
160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6688128F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece6688128f/avx/vec256_ama_asm.S b/crypto_kem/mceliece6688128f/avx/vec256_ama_asm.S new file mode 100644 index 00000000..043839c1 --- /dev/null +++ b/crypto_kem/mceliece6688128f/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: 
reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 
+ 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 
+vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 
224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ 
input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# 
qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 
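+
+# The statements in this routine all follow one bit-sliced pattern: a 256-bit
+# coefficient word a_i is loaded from input_0, xor-ed with the matching word
+# of input_1, and the sum is stored back to input_0; the sum is then AND-ed
+# (vpand) with each coefficient word b_j held at input_2, and every partial
+# product is xor-ed (vpxor) into the accumulator r_{i+j}. Over the thirteen
+# coefficient words this gives a carry-less (GF(2)) schoolbook
+# multiply-accumulate.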
+ +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 
] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: 
vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 
+vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 
192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: 
vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 
160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + 
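+
+# Same schoolbook pattern as above: each remaining coefficient word a11..a0 of
+# input_1 is AND-ed with the thirteen words b0..b12 at input_2 and xor-ed into
+# the accumulators, while the high partial products r13..r24 are folded back
+# into r0..r12 by additional xors, i.e. the product is reduced modulo the
+# field polynomial.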
+# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 
352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 
120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128F_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128f/clean/api.h b/crypto_kem/mceliece6688128f/clean/api.h new file mode 100644 index 00000000..c4c6025d --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_API_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/benes.c 
b/crypto_kem/mceliece6688128f/clean/benes.c new file mode 100644 index 00000000..5cdad3c3 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128F_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + 
PQCLEAN_MCELIECE6688128F_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE6688128F_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE6688128F_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE6688128F_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE6688128F_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/benes.h b/crypto_kem/mceliece6688128f/clean/benes.h new file mode 100644 index 00000000..57c72d43 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_BENES_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128F_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE6688128F_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/bm.c b/crypto_kem/mceliece6688128f/clean/bm.c new file mode 100644 index 00000000..4217709c --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
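The layer_in()/layer_ex() functions above all rely on the same masked-butterfly idiom. A minimal standalone sketch (not part of the patch; cond_swap64 is an illustrative name) showing that each set bit of the condition word exchanges the corresponding bit of the two data words, while clear bits leave them alone, with the same instructions executed either way:

#include <assert.h>
#include <stdint.h>

/* Exchange, bit by bit, the positions of a and b selected by mask. */
static void cond_swap64(uint64_t *a, uint64_t *b, uint64_t mask) {
    uint64_t d = (*a ^ *b) & mask;
    *a ^= d;
    *b ^= d;
}

int main(void) {
    uint64_t x = 0xC, y = 0xA;      /* 1100 and 1010 */
    cond_swap64(&x, &y, 0x0);       /* empty mask: nothing moves */
    assert(x == 0xC && y == 0xA);
    cond_swap64(&x, &y, 0x6);       /* mask 0110: bits 1 and 2 are exchanged */
    assert(x == 0xA && y == 0xC);
    return 0;
}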
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE6688128F_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE6688128F_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/bm.h b/crypto_kem/mceliece6688128f/clean/bm.h new file mode 100644 index 00000000..cddb1b6c --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_BM_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128F_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/controlbits.c b/crypto_kem/mceliece6688128f/clean/controlbits.c new file mode 100644 index 00000000..cd603733 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
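The mne/mle mask arithmetic in PQCLEAN_MCELIECE6688128F_CLEAN_bm() above is easier to follow in isolation. A standalone sketch (not part of the patch; mask_nonzero and mask_ge are illustrative names) of the two all-or-nothing masks whose AND reproduces the classical update condition "d != 0 and N >= 2L" without branching on secret data:

#include <assert.h>
#include <stdint.h>

/* 0xFFFF iff d != 0 (the "mne" computation). */
static uint16_t mask_nonzero(uint16_t d) {
    uint16_t m = d;
    m -= 1;      /* wraps to 0xFFFF only when d == 0 */
    m >>= 15;    /* 1 iff d == 0 */
    m -= 1;      /* 0x0000 iff d == 0, 0xFFFF otherwise */
    return m;
}

/* 0xFFFF iff a >= b, valid for a, b < 2^15 (the "mle" computation). */
static uint16_t mask_ge(uint16_t a, uint16_t b) {
    uint16_t m = (uint16_t)(a - b);
    m >>= 15;    /* 1 iff a < b */
    m -= 1;      /* 0xFFFF iff a >= b */
    return m;
}

int main(void) {
    assert(mask_nonzero(0) == 0x0000);
    assert(mask_nonzero(5) == 0xFFFF);
    assert(mask_ge(4, 4) == 0xFFFF);
    assert(mask_ge(3, 4) == 0x0000);
    return 0;
}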
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
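merge()/sort() above implement Batcher's odd-even merge sort, a sorting network whose compare-exchange pattern is fixed in advance and therefore independent of the (secret) data. A standalone sketch (not part of the patch; oe_sort is an illustrative name, and the comparator uses an ordinary branch purely for readability where the patch uses is_smaller()/cswap()) that makes the power-of-two requirement concrete:

#include <assert.h>
#include <stdint.h>

static void minmax(uint32_t *x, uint32_t *y) {
    uint32_t lo = (*x < *y) ? *x : *y;   /* illustration only; the patch is branch-free here */
    uint32_t hi = (*x < *y) ? *y : *x;
    *x = lo;
    *y = hi;
}

/* Odd-even merge of two sorted halves laid out with the given stride. */
static void oe_merge(int n, uint32_t *x, int step) {
    if (n == 1) {
        minmax(&x[0], &x[step]);
        return;
    }
    oe_merge(n / 2, x, step * 2);
    oe_merge(n / 2, x + step, step * 2);
    for (int i = 1; i < 2 * n - 1; i += 2) {
        minmax(&x[i * step], &x[(i + 1) * step]);
    }
}

static void oe_sort(int n, uint32_t *x) {   /* n must be a power of 2 */
    if (n <= 1) {
        return;
    }
    oe_sort(n / 2, x);
    oe_sort(n / 2, x + n / 2);
    oe_merge(n / 2, x, 1);
}

int main(void) {
    uint32_t a[8] = {5, 3, 7, 1, 8, 2, 6, 4};
    oe_sort(8, a);
    for (int i = 0; i < 8; i++) {
        assert(a[i] == (uint32_t)(i + 1));
    }
    return 0;
}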
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/controlbits.h b/crypto_kem/mceliece6688128f/clean/controlbits.h new file mode 100644 index 00000000..a85d256e --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
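controlbitsfrompermutation() above emits (2w-1)*n/2 control bits for a Benes network on n = 2^w inputs. A standalone size check (not part of the patch) confirming that, for GFBITS = 13, this matches both sizeof(c) in PQCLEAN_MCELIECE6688128F_CLEAN_controlbits() and COND_BYTES in params.h:

#include <assert.h>

int main(void) {
    const int w = 13;                 /* GFBITS */
    const long n = 1L << w;           /* network size */
    const long bits = (2L * w - 1) * (n / 2);

    assert(bits == 102400);                        /* 25 * 4096 control bits  */
    assert(bits / 8 == 12800);                     /* COND_BYTES              */
    assert((2L * w - 1) * n / 16 == 12800);        /* sizeof(c) in controlbits() */
    return 0;
}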
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/crypto_hash.h b/crypto_kem/mceliece6688128f/clean/crypto_hash.h new file mode 100644 index 00000000..e77415ce --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128f/clean/decrypt.c b/crypto_kem/mceliece6688128f/clean/decrypt.c new file mode 100644 index 00000000..fcf7593a --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128F_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE6688128F_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE6688128F_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE6688128F_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE6688128F_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE6688128F_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE6688128F_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece6688128f/clean/decrypt.h b/crypto_kem/mceliece6688128f/clean/decrypt.h new file mode 100644 index 00000000..cd90538f --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128F_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/encrypt.c b/crypto_kem/mceliece6688128f/clean/encrypt.c new file mode 100644 index 00000000..6dc9ae98 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include 
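The last few lines of PQCLEAN_MCELIECE6688128F_CLEAN_decrypt() above collapse the weight check and the syndrome re-check into a branch-free 0/1 return value. A standalone sketch of that final step (not part of the patch; fold is an illustrative name); it relies on the accumulator staying below 2^15, which holds here because weights and field elements fit in 13 bits:

#include <assert.h>
#include <stdint.h>

/* acc is 0 only if the error weight equals SYS_T and the recomputed
   syndrome matches; turn "acc == 0" into 0 and anything else into 1. */
static int fold(uint16_t acc) {
    acc -= 1;        /* 0xFFFF iff acc was 0 */
    acc >>= 15;      /* 1 iff acc was 0 */
    return acc ^ 1;  /* 0 on success, 1 on failure */
}

int main(void) {
    assert(fold(0) == 0);         /* all checks passed */
    assert(fold(1) == 1);         /* some check failed */
    assert(fold(0x1FFF) == 1);    /* largest value "check" can take here */
    return 0;
}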
+ +#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE6688128F_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128f/clean/encrypt.h b/crypto_kem/mceliece6688128f/clean/encrypt.h new file mode 100644 index 00000000..aae2b033 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128F_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/gf.c b/crypto_kem/mceliece6688128f/clean/gf.c new file mode 100644 index 00000000..173e65b1 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/gf.c @@ -0,0 +1,210 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = 
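syndrome() in encrypt.c above reduces each AND of a matrix-row byte with an error byte to one bit using three shift-XOR folds. A standalone parity sketch (not part of the patch; parity8 is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* After the three folds, bit 0 of b is the XOR of all eight input bits,
   i.e. the dot product (mod 2) of one row byte with one error byte. */
static uint8_t parity8(uint8_t b) {
    b ^= b >> 4;
    b ^= b >> 2;
    b ^= b >> 1;
    return b & 1;
}

int main(void) {
    assert(parity8(0x00) == 0);
    assert(parity8(0x01) == 1);
    assert(parity8(0xB2) == 0);   /* 0b10110010 has four bits set */
    assert(parity8(0xB3) == 1);   /* five bits set */
    return 0;
}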
tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128F_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE6688128F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(prod[i], (gf) 7682); + prod[i - SYS_T + 3] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(prod[i], (gf) 2159); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(prod[i], 
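The two reduction steps in PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul() above use shifts of 9, 10, 12 and 13, i.e. they fold the high bits back with x^13 = x^4 + x^3 + x + 1, so the field is GF(2)[x]/(x^13 + x^4 + x^3 + x + 1). A standalone, deliberately simple (and not constant-time) reference multiplier over that polynomial (not part of the patch; ref_mul is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* Schoolbook carry-less multiply followed by bit-at-a-time reduction
   modulo x^13 + x^4 + x^3 + x + 1 (0x201B). Illustration only. */
static uint16_t ref_mul(uint16_t a, uint16_t b) {
    uint32_t acc = 0;

    for (int i = 0; i < 13; i++) {
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }
    for (int i = 24; i >= 13; i--) {        /* product degree is at most 24 */
        if ((acc >> i) & 1) {
            acc ^= (uint32_t)0x201B << (i - 13);
        }
    }
    return (uint16_t)(acc & 0x1FFF);        /* GFMASK for GFBITS = 13 */
}

int main(void) {
    assert(ref_mul(0x1234, 1) == 0x1234);   /* 1 is the multiplicative identity */
    assert(ref_mul(2, 1 << 12) == 0x001B);  /* x * x^12 = x^13 = x^4 + x^3 + x + 1 */
    return 0;
}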
(gf) 3597); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/gf.h b/crypto_kem/mceliece6688128f/clean/gf.h new file mode 100644 index 00000000..c4d3709b --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_GF_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6688128F_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE6688128F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/operations.c b/crypto_kem/mceliece6688128f/clean/operations.c new file mode 100644 index 00000000..18f6a5b1 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128F_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128F_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128F_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if 
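In PQCLEAN_MCELIECE6688128F_CLEAN_crypto_kem_dec() above, the hashed preimage is filled from either the recovered error vector e (on success) or the stored secret bytes of sk (on failure, i.e. implicit rejection), selected by a byte mask rather than a branch. A standalone sketch of that selection (not part of the patch; select_byte is an illustrative name); it assumes, as in the code, that the status value fits in one byte:

#include <assert.h>
#include <stdint.h>

/* status = ret_decrypt | ret_confirm in the code, so it is always < 256. */
static uint8_t select_byte(uint8_t on_fail, uint8_t on_success, uint16_t status) {
    uint16_t m = status;
    m -= 1;
    m >>= 8;               /* 0x00FF on success (status == 0), 0x0000 on failure */
    return (uint8_t)((~m & on_fail) | (m & on_success));
}

int main(void) {
    assert(select_byte(0xAA, 0x55, 0) == 0x55);    /* success: take e[i]  */
    assert(select_byte(0xAA, 0x55, 1) == 0xAA);    /* failure: take sk[i] */
    assert(select_byte(0xAA, 0x55, 37) == 0xAA);
    return 0;
}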
(PQCLEAN_MCELIECE6688128F_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128F_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128F_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128F_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128F_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/clean/params.h b/crypto_kem/mceliece6688128f/clean/params.h new file mode 100644 index 00000000..bdd1f9fd --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/pk_gen.c b/crypto_kem/mceliece6688128f/clean/pk_gen.c new file mode 100644 index 00000000..324e9fee --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/pk_gen.c @@ -0,0 +1,294 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint8_t mat[][ SYS_N / 8 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 8; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load8( &mat[ row + i ][ block_idx ] ); + } + + // compute the column indices of pivots by Gaussian elimination. 
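The derived constants in params.h above line up with the byte counts advertised in api.h for this parameter set. A standalone arithmetic check (not part of the patch):

#include <assert.h>

int main(void) {
    const long GFBITS = 13, SYS_N = 6688, SYS_T = 128;
    const long PK_NROWS = SYS_T * GFBITS;                            /* 1664  */
    const long PK_ROW_BYTES = (SYS_N - PK_NROWS + 7) / 8;            /* 628   */
    const long SYND_BYTES = (PK_NROWS + 7) / 8;                      /* 208   */
    const long IRR_BYTES = SYS_T * 2;                                /* 256   */
    const long COND_BYTES = (1L << (GFBITS - 4)) * (2 * GFBITS - 1); /* 12800 */

    assert(PK_NROWS * PK_ROW_BYTES == 1044992);           /* CRYPTO_PUBLICKEYBYTES  */
    assert(SYS_N / 8 + IRR_BYTES + COND_BYTES == 13892);  /* CRYPTO_SECRETKEYBYTES  */
    assert(SYND_BYTES + 32 == 240);                       /* CRYPTO_CIPHERTEXTBYTES */
    return 0;
}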
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_CLEAN_load8( &mat[ i + j ][ block_idx ] ); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + PQCLEAN_MCELIECE6688128F_CLEAN_store8( &mat[ i + j ][ block_idx ], buf[j] ); + } + } + + return 0; +} + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE6688128F_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6688128F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE6688128F_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE6688128F_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE6688128F_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= 
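mov_columns() above locates pivot columns with ctz(), the branch-free trailing-zero count defined earlier in pk_gen.c. A standalone sketch of that counter (not part of the patch; ctz64 is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* m latches to 1 at the first set bit; r counts positions visited while
   both m and the current bit are still 0, so r ends up as the index of
   the lowest set bit. */
static int ctz64(uint64_t in) {   /* input must be non-zero, as in pk_gen.c */
    int m = 0, r = 0;
    for (int i = 0; i < 64; i++) {
        int b = (int)(in >> i) & 1;
        m |= b;
        r += (m ^ 1) & (b ^ 1);
    }
    return r;
}

int main(void) {
    assert(ctz64(1) == 0);
    assert(ctz64(0x80) == 7);
    assert(ctz64(0xF000000000000000ULL) == 60);
    return 0;
}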
j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/clean/pk_gen.h b/crypto_kem/mceliece6688128f/clean/pk_gen.h new file mode 100644 index 00000000..c9ce6584 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6688128F_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/root.c b/crypto_kem/mceliece6688128f/clean/root.c new file mode 100644 index 00000000..655cf025 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE6688128F_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE6688128F_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE6688128F_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE6688128F_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/root.h b/crypto_kem/mceliece6688128f/clean/root.h new file mode 100644 index 00000000..faf363d7 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE6688128F_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE6688128F_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/sk_gen.c b/crypto_kem/mceliece6688128f/clean/sk_gen.c new file mode 100644 index 00000000..acee707e --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128F_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128F_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + 
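eval() in root.c above is Horner's rule: starting from the leading coefficient, r is repeatedly multiplied by the evaluation point and the next coefficient is added. The same recurrence over ordinary integers (standalone sketch, not part of the patch; horner is an illustrative name):

#include <assert.h>

static long horner(const long *f, int deg, long a) {
    long r = f[deg];
    for (int i = deg - 1; i >= 0; i--) {
        r = r * a + f[i];     /* in root.c: gf_mul then gf_add */
    }
    return r;
}

int main(void) {
    const long f[4] = {5, 0, -2, 1};     /* x^3 - 2x^2 + 5 */
    assert(horner(f, 3, 3) == 14);       /* 27 - 18 + 5 */
    assert(horner(f, 3, 0) == 5);
    return 0;
}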
for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128F_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128F_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128F_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128F_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/clean/sk_gen.h b/crypto_kem/mceliece6688128f/clean/sk_gen.h new file mode 100644 index 00000000..ee170381 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128F_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128F_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/synd.c b/crypto_kem/mceliece6688128f/clean/synd.c new file mode 100644 index 00000000..ebd8c0d8 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE6688128F_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE6688128F_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE6688128F_CLEAN_gf_inv(PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE6688128F_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE6688128F_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/synd.h b/crypto_kem/mceliece6688128f/clean/synd.h new file mode 100644 index 00000000..f0cee745 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_SYND_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6688128F_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/transpose.c b/crypto_kem/mceliece6688128f/clean/transpose.c new file mode 100644 index 00000000..ad81cee6 --- /dev/null +++ 
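Written out, the double loop in PQCLEAN_MCELIECE6688128F_CLEAN_synd() above computes, for the received word r, support elements alpha_i = L[i] and Goppa polynomial g (the parameter named f in the code):

\[
  S_j \;=\; \sum_{i=0}^{n-1} \frac{r_i \, \alpha_i^{\,j}}{g(\alpha_i)^2},
  \qquad 0 \le j < 2t,
\]

which are the 2t syndrome coordinates consumed by the Berlekamp-Massey decoder.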
b/crypto_kem/mceliece6688128f/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece6688128f/clean/transpose.h b/crypto_kem/mceliece6688128f/clean/transpose.h new file mode 100644 index 00000000..14d974ae --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/clean/util.c b/crypto_kem/mceliece6688128f/clean/util.c new file mode 100644 index 00000000..1b84afe8 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE6688128F_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128F_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128F_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE6688128F_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128F_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE6688128F_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece6688128f/clean/util.h b/crypto_kem/mceliece6688128f/clean/util.h new file mode 100644 index 00000000..732e8243 --- /dev/null +++ b/crypto_kem/mceliece6688128f/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128F_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE6688128F_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" 
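bitrev() in util.c above reverses the 16 bits of its argument with masked swaps and then shifts right by 3, leaving the 13-bit (GFBITS) reversal. A standalone copy with a couple of spot checks (not part of the patch; bitrev13 is an illustrative name):

#include <assert.h>
#include <stdint.h>

static uint16_t bitrev13(uint16_t a) {
    a = (uint16_t)(((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8));
    a = (uint16_t)(((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4));
    a = (uint16_t)(((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2));
    a = (uint16_t)(((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1));
    return a >> 3;
}

int main(void) {
    assert(bitrev13(0x0001) == 0x1000);   /* bit 0 -> bit 12       */
    assert(bitrev13(0x1000) == 0x0001);   /* bit 12 -> bit 0       */
    assert(bitrev13(0x0003) == 0x1800);   /* bits 0,1 -> bits 12,11 */
    return 0;
}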
+#include + +void PQCLEAN_MCELIECE6688128F_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE6688128F_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE6688128F_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE6688128F_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE6688128F_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE6688128F_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/LICENSE b/crypto_kem/mceliece6688128f/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6688128f/sse/Makefile b/crypto_kem/mceliece6688128f/sse/Makefile new file mode 100644 index 00000000..275b1d02 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6688128f_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc powers.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6688128f/sse/aes256ctr.c b/crypto_kem/mceliece6688128f/sse/aes256ctr.c new file mode 100644 index 00000000..12d22ca8 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6688128F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6688128f/sse/aes256ctr.h b/crypto_kem/mceliece6688128f/sse/aes256ctr.h new file mode 100644 index 00000000..ae9b48f8 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE6688128F_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void 
PQCLEAN_MCELIECE6688128F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128f/sse/api.h b/crypto_kem/mceliece6688128f/sse/api.h new file mode 100644 index 00000000..3a371b1d --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_API_H +#define PQCLEAN_MCELIECE6688128F_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/benes.c b/crypto_kem/mceliece6688128f/sse/benes.c new file mode 100644 index 00000000..e53d48c0 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[0]); 
+ bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + 
diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128F_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128F_SSE_load8(ptr), PQCLEAN_MCELIECE6688128F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128F_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128F_SSE_load8(ptr), PQCLEAN_MCELIECE6688128F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128F_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( r ); + + 
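+    /* Apply the 2*GFBITS - 1 = 25 conditional-swap layers of the Benes
+     * network.  Each layer consumes one block of 32 control vectors; b_ptr
+     * walks forward through the blocks when rev == 0 and backward for the
+     * inverse permutation.  The transposes in between convert between the
+     * two bitsliced layouts used by the outer groups of 6 layers and the
+     * middle group of 13. */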
layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6688128f/sse/benes.h b/crypto_kem/mceliece6688128f/sse/benes.h new file mode 100644 index 00000000..5678c828 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_BENES_H +#define PQCLEAN_MCELIECE6688128F_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128F_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128F_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/bm.c b/crypto_kem/mceliece6688128f/sse/bm.c new file mode 100644 index 00000000..3eb1acc2 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/bm.c @@ -0,0 +1,208 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE6688128F_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128F_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[idx1], mask[1])); + + 
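+    /* x keeps the mask[0]-selected s-bit groups of in[idx0] and pulls in the
+     * matching groups of in[idx1], shifted up by s; y keeps the
+     * mask[1]-selected groups of in[idx1] and takes those of in[idx0],
+     * shifted down.  Storing them back below is one butterfly of the
+     * bit-matrix transpose that get_coefs() uses to turn the bitsliced words
+     * back into ordinary field-element coefficients. */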
in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128F_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(); + B[0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(prod, C, (vec128 *) interval); + PQCLEAN_MCELIECE6688128F_SSE_update_asm(interval, coefs[N]); + d = 
PQCLEAN_MCELIECE6688128F_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128F_SSE_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE6688128F_SSE_update_asm(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128F_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece6688128f/sse/bm.h b/crypto_kem/mceliece6688128f/sse/bm.h new file mode 100644 index 00000000..c4b842ba --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_BM_H +#define PQCLEAN_MCELIECE6688128F_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128F_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/consts.S b/crypto_kem/mceliece6688128f/sse/consts.S new file mode 100644 index 00000000..2ef828d3 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE6688128F_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE6688128F_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128F_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128F_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128F_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6688128F_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128F_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128F_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128F_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128F_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128F_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128F_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128F_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6688128f/sse/consts.inc b/crypto_kem/mceliece6688128f/sse/consts.inc new file mode 100644 index 00000000..5bf8f200 --- 
/dev/null +++ b/crypto_kem/mceliece6688128f/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece6688128f/sse/controlbits.c b/crypto_kem/mceliece6688128f/sse/controlbits.c new file mode 100644 index 00000000..545e231f --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128F_SSE_sort_63b(int n, 
uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128F_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128F_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE6688128F_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/sse/controlbits.h b/crypto_kem/mceliece6688128f/sse/controlbits.h new file mode 100644 index 00000000..223a66f1 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128F_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128F_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/crypto_hash.h b/crypto_kem/mceliece6688128f/sse/crypto_hash.h new file mode 100644 index 00000000..eff22ccb --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128F_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128f/sse/decrypt.c b/crypto_kem/mceliece6688128f/sse/decrypt.c new file mode 100644 index 00000000..3b081429 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/decrypt.c @@ -0,0 +1,204 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6688128F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128F_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128F_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(err[i], 0); + 
v[1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128F_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128F_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(diff, PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6688128F_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128F_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 64 ][ GFBITS ]; + vec128 scaled[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + + vec128 error[ 64 ]; + + vec128 s_priv[ 2 ][ GFBITS ]; + vec128 s_priv_cmp[ 2 ][ GFBITS ]; + + vec128 locator[ GFBITS ]; + + vec128 recv[ 64 ]; + vec128 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6688128F_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128F_SSE_benes(recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6688128F_SSE_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128F_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128F_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6688128F_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6688128F_SSE_benes(error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128f/sse/decrypt.h b/crypto_kem/mceliece6688128f/sse/decrypt.h new file mode 100644 index 00000000..372832d5 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE6688128F_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption 
+*/ + +int PQCLEAN_MCELIECE6688128F_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/encrypt.c b/crypto_kem/mceliece6688128f/sse/encrypt.c new file mode 100644 index 00000000..482379c2 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/encrypt.c @@ -0,0 +1,105 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128F_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128F_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128f/sse/encrypt.h b/crypto_kem/mceliece6688128f/sse/encrypt.h new file mode 100644 index 00000000..184d51cf --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128F_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128F_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/fft.c b/crypto_kem/mceliece6688128f/sse/fft.c new file mode 100644 index 00000000..057f9b25 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/fft.c @@ -0,0 +1,243 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec128 powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], pre[0][i]); 
+ buf[16] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[28], 
pre[3][i]); + buf[37] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128F_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6688128f/sse/fft.h b/crypto_kem/mceliece6688128f/sse/fft.h new file mode 100644 index 00000000..6303b50e --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_FFT_H +#define PQCLEAN_MCELIECE6688128F_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE6688128F_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/fft_tr.c b/crypto_kem/mceliece6688128f/sse/fft_tr.c new file mode 100644 index 00000000..b2116d39 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/fft_tr.c @@ -0,0 +1,338 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) 
+ }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = 
{5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6688128F_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], 
buf[56]); + buf[57] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = 
PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, pre[i], 
tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +void PQCLEAN_MCELIECE6688128F_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128f/sse/fft_tr.h b/crypto_kem/mceliece6688128f/sse/fft_tr.h new file mode 100644 index 00000000..02d8ece9 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE6688128F_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128F_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/gf.c b/crypto_kem/mceliece6688128f/sse/gf.c new file mode 100644 index 00000000..a2fa98ce --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128F_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128F_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; 
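
The reductions in PQCLEAN_MCELIECE6688128F_SSE_gf_mul above fold the high bits of the 13x13 carry-less product (bits 13..24) back into the low 13 bits using the identity x^13 = x^4 + x^3 + x + 1 in GF(2^13); the shift amounts 9, 10, 12 and 13 are 13 minus the exponents 4, 3, 1 and 0. The following is a minimal standalone sketch of the same field multiplication, written bit by bit instead of with the masked word-level reduction, only to make the reduction polynomial explicit; it is not part of the patch and should agree with the optimized routine above.

#include <stdint.h>
#include <stdio.h>

/* GF(2^13) with reduction polynomial x^13 + x^4 + x^3 + x + 1 */
#define GF13_POLY_LOW 0x001Bu  /* x^4 + x^3 + x + 1 */
#define GF13_MASK     0x1FFFu

static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;
    int i;

    /* carry-less schoolbook product of two degree-<=12 polynomials */
    for (i = 0; i < 13; i++) {
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }

    /* reduce bit by bit using x^13 = x^4 + x^3 + x + 1 */
    for (i = 24; i >= 13; i--) {
        if ((acc >> i) & 1) {
            acc ^= (uint32_t)1 << i;
            acc ^= GF13_POLY_LOW << (i - 13);
        }
    }

    return (uint16_t)(acc & GF13_MASK);
}

int main(void) {
    uint16_t a = 0x1234 & GF13_MASK, b = 0x0ABC;

    /* basic sanity checks: multiplicative identity and commutativity */
    printf("a*1 == a   : %d\n", gf13_mul_ref(a, 1) == a);
    printf("a*b == b*a : %d\n", gf13_mul_ref(a, b) == gf13_mul_ref(b, a));
    printf("a*b        : 0x%04x\n", gf13_mul_ref(a, b));
    return 0;
}
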
+ uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128F_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128F_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128F_SSE_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128F_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128F_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128F_SSE_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128F_SSE_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128F_SSE_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/sse/gf.h b/crypto_kem/mceliece6688128f/sse/gf.h new file mode 100644 index 00000000..758ab8bb --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_GF_H +#define PQCLEAN_MCELIECE6688128F_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128F_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128F_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128F_SSE_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128F_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128F_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128F_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/operations.c b/crypto_kem/mceliece6688128f/sse/operations.c new file mode 100644 index 00000000..699cab20 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = 
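
PQCLEAN_MCELIECE6688128F_SSE_gf_frac above computes num/den as den^(2^13 - 2) * num; the chain tmp_11 -> tmp_1111 -> out builds the exponent 8190 = 0b1111111111110, which is what the "11", "1111", ... comments are tracking. Below is a minimal standalone sketch that obtains the same inverse by plain square-and-multiply (the bit-by-bit multiply is repeated so the sketch compiles on its own); it should satisfy a * a^-1 = 1 for every nonzero field element.

#include <stdint.h>
#include <stdio.h>

/* GF(2^13), reduction polynomial x^13 + x^4 + x^3 + x + 1 */
static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;
    int i;
    for (i = 0; i < 13; i++) {
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }
    for (i = 24; i >= 13; i--) {
        if ((acc >> i) & 1) {
            acc ^= ((uint32_t)1 << i) ^ ((uint32_t)0x1B << (i - 13));
        }
    }
    return (uint16_t)(acc & 0x1FFF);
}

/* inverse by Fermat: a^(2^13 - 2), exponent 8190 = 0b1111111111110 */
static uint16_t gf13_inv_ref(uint16_t a) {
    uint16_t r = 1;
    int i;
    for (i = 12; i >= 0; i--) {
        r = gf13_mul_ref(r, r);          /* square */
        if ((8190 >> i) & 1) {
            r = gf13_mul_ref(r, a);      /* multiply */
        }
    }
    return r;
}

int main(void) {
    uint16_t a;
    int ok = 1;
    for (a = 1; a < (1 << 13); a++) {
        ok &= (gf13_mul_ref(a, gf13_inv_ref(a)) == 1);
    }
    printf("a * a^-1 == 1 for all nonzero a: %d\n", ok);
    return 0;
}
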
{1}; + + PQCLEAN_MCELIECE6688128F_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128F_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128F_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128F_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128F_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128F_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128F_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128F_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128F_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128F_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/sse/params.h b/crypto_kem/mceliece6688128f/sse/params.h new file mode 100644 index 00000000..7501664e --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_PARAMS_H +#define PQCLEAN_MCELIECE6688128F_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/pk_gen.c b/crypto_kem/mceliece6688128f/sse/pk_gen.c new file mode 100644 index 00000000..61067e44 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/pk_gen.c @@ -0,0 +1,346 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include 
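
In PQCLEAN_MCELIECE6688128F_SSE_crypto_kem_dec above, the mask m is derived from ret_decrypt | ret_confirm: subtracting 1 and shifting right by 8 turns "all checks passed" (value 0) into 0xFF and any failure into 0x00, so the hash preimage is filled from either the recovered error vector e or the stored secret string without a data-dependent branch (implicit rejection). A minimal sketch of that mask-and-select pattern on its own, with illustrative names (status, on_failure, on_success):

#include <stdint.h>
#include <stdio.h>

/* 0x00 -> 0xFF (success), any nonzero status byte -> 0x00 (failure) */
static uint8_t ok_mask(uint8_t status) {
    uint16_t m = status;
    m -= 1;        /* wraps to 0xFFFF only when status == 0 */
    m >>= 8;
    return (uint8_t)m;
}

/* branch-free select: returns b when mask == 0xFF, a when mask == 0x00 */
static uint8_t ct_select(uint8_t mask, uint8_t a, uint8_t b) {
    return (uint8_t)((~mask & a) | (mask & b));
}

int main(void) {
    uint8_t on_failure = 0x5A;   /* stands in for a byte of the stored secret string */
    uint8_t on_success = 0xC3;   /* stands in for a byte of the recovered error vector */

    printf("status 0x00 -> 0x%02x (expect 0xc3)\n",
           ct_select(ok_mask(0x00), on_failure, on_success));
    printf("status 0x01 -> 0x%02x (expect 0x5a)\n",
           ct_select(ok_mask(0x01), on_failure, on_success));
    return 0;
}
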
"util.h" + +#include +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6688128F_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128F_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128F_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = 
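
The "compute the inverses" block at the top of PQCLEAN_MCELIECE6688128F_SSE_pk_gen above is the batched-inversion trick: build the running products prod[0..63], invert only the final product, then walk backwards so each prod[i] ends up holding the inverses of the corresponding eval[i] (elementwise over the bitsliced field elements), at the cost of extra multiplications but only one inversion for all 64 entries. A minimal sketch of the same loop structure, transplanted to integers modulo the small prime 65537 purely so it stays self-contained; the prime and the input values are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define P 65537u   /* small prime, for illustration only */
#define N 8

static uint32_t mul_p(uint32_t a, uint32_t b) {
    return (uint32_t)(((uint64_t)a * b) % P);
}

/* single inversion by Fermat: a^(P-2) mod P */
static uint32_t inv_p(uint32_t a) {
    uint32_t r = 1, e = P - 2;
    while (e > 0) {
        if (e & 1) {
            r = mul_p(r, a);
        }
        a = mul_p(a, a);
        e >>= 1;
    }
    return r;
}

int main(void) {
    uint32_t in[N] = {3, 7, 11, 19, 100, 9999, 12345, 65000};
    uint32_t prod[N];
    uint32_t tmp;
    int i, ok = 1;

    /* running products: prod[i] = in[0] * ... * in[i] */
    prod[0] = in[0];
    for (i = 1; i < N; i++) {
        prod[i] = mul_p(prod[i - 1], in[i]);
    }

    /* one inversion of the total product */
    tmp = inv_p(prod[N - 1]);

    /* walk backwards, mirroring pk_gen: prod[i] becomes 1/in[i] */
    for (i = N - 2; i >= 0; i--) {
        prod[i + 1] = mul_p(prod[i], tmp);
        tmp = mul_p(tmp, in[i + 1]);
    }
    prod[0] = tmp;

    for (i = 0; i < N; i++) {
        ok &= (mul_p(in[i], prod[i]) == 1);
    }
    printf("all inverses correct: %d\n", ok);
    return 0;
}
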
PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE6688128F_SSE_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128F_SSE_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/sse/pk_gen.h b/crypto_kem/mceliece6688128f/sse/pk_gen.h new file mode 100644 index 00000000..24eaa11d --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE6688128F_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6688128F_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/powers.inc b/crypto_kem/mceliece6688128f/sse/powers.inc new file mode 100644 index 00000000..da05dcc8 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/powers.inc @@ -0,0 +1,960 @@ +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece6688128f/sse/scalars_2x.inc b/crypto_kem/mceliece6688128f/sse/scalars_2x.inc new file mode 100644 index 00000000..db1572c7 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 
0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6688128f/sse/scalars_4x.inc b/crypto_kem/mceliece6688128f/sse/scalars_4x.inc new file mode 100644 index 00000000..363ea011 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFC30CF303F3FF00F, 
0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000),
+},
+{
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+}}
+
diff --git a/crypto_kem/mceliece6688128f/sse/sk_gen.c b/crypto_kem/mceliece6688128f/sse/sk_gen.c
new file mode 100644
index 00000000..2800eb1c
--- /dev/null
+++ b/crypto_kem/mceliece6688128f/sse/sk_gen.c
@@ -0,0 +1,98 @@
+/*
+  This file is for secret-key generation
+*/
+
+#include "sk_gen.h"
+
+#include "controlbits.h"
+#include "gf.h"
+#include "params.h"
+#include "util.h"
+
+/* input: f, element in GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE6688128F_SSE_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE6688128F_SSE_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // Gaussian elimination
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE6688128F_SSE_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE6688128F_SSE_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE6688128F_SSE_gf_mul(mat[ c ][ j ], inv);
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128F_SSE_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/* 0 otherwise */
+int PQCLEAN_MCELIECE6688128F_SSE_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE6688128F_SSE_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/crypto_kem/mceliece6688128f/sse/sk_gen.h b/crypto_kem/mceliece6688128f/sse/sk_gen.h new file
mode 100644 index 00000000..3654c020 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE6688128F_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128F_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128F_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/sse/syndrome_asm.S b/crypto_kem/mceliece6688128f/sse/syndrome_asm.S new file mode 100644 index 00000000..31cd52f2 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/syndrome_asm.S @@ -0,0 +1,1260 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm +.global PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm +_PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm: +PQCLEAN_MCELIECE6688128F_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#2 +# asm 2: movdqu 208(ee=%xmm1 +movdqu 208(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# 
qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(ee=reg128#3 +# asm 2: movdqu 432(ee=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(ee=reg128#3 +# asm 2: movdqu 448(ee=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(ee=reg128#3 +# asm 2: movdqu 464(ee=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(ee=reg128#3 +# asm 2: movdqu 480(ee=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(ee=reg128#3 +# asm 2: movdqu 496(ee=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(ee=reg128#3 +# asm 2: movdqu 512(ee=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(ee=reg128#3 +# asm 2: movdqu 528(ee=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(ee=reg128#3 +# asm 2: movdqu 544(ee=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# 
qhasm: ee = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(ee=reg128#3 +# asm 2: movdqu 560(ee=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(ee=reg128#3 +# asm 2: movdqu 576(ee=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 592 ] +# asm 1: movdqu 592(ee=reg128#3 +# asm 2: movdqu 592(ee=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(ee=reg128#3 +# asm 2: movdqu 608(ee=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 416(pp=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(ee=reg128#3 +# asm 2: movdqu 624(ee=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 432(pp=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(ee=reg128#3 +# asm 2: movdqu 640(ee=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 448(pp=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(ee=reg128#3 +# asm 2: movdqu 656(ee=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 464(pp=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 672 ] +# asm 1: movdqu 672(ee=reg128#3 +# asm 2: movdqu 672(ee=%xmm2 +movdqu 672(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 480(pp=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 688 ] +# asm 1: movdqu 688(ee=reg128#3 +# asm 2: movdqu 688(ee=%xmm2 +movdqu 688(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 496(pp=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 704 ] +# asm 1: movdqu 704(ee=reg128#3 +# asm 2: movdqu 704(ee=%xmm2 +movdqu 704(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 512(pp=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 720 ] +# asm 1: movdqu 720(ee=reg128#3 +# asm 2: movdqu 720(ee=%xmm2 +movdqu 720(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 528(pp=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 736 ] +# asm 1: movdqu 736(ee=reg128#3 +# asm 2: movdqu 736(ee=%xmm2 +movdqu 736(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 544(pp=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 752 ] +# asm 1: movdqu 752(ee=reg128#3 +# asm 2: movdqu 752(ee=%xmm2 +movdqu 752(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 560(pp=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 768 ] +# asm 1: movdqu 768(ee=reg128#3 +# asm 2: movdqu 768(ee=%xmm2 +movdqu 768(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 576(pp=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 784 ] +# asm 1: movdqu 784(ee=reg128#3 +# asm 2: movdqu 784(ee=%xmm2 +movdqu 784(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 592(pp=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 800 ] +# asm 1: movdqu 800(ee=reg128#3 +# asm 2: movdqu 800(ee=%xmm2 
+movdqu 800(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 96(ss=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 112(ss=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#2 +# asm 2: movdqu 112(ee=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 128(ss=%xmm0 +movdqu 128(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#2 +# asm 2: movdqu 128(ee=%xmm1 +movdqu 128(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 
144(ss=%xmm0 +movdqu 144(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#2 +# asm 2: movdqu 144(ee=%xmm1 +movdqu 144(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 160(ss=%xmm0 +movdqu 160(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#2 +# asm 2: movdqu 160(ee=%xmm1 +movdqu 160(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 176(ss=%xmm0 +movdqu 176(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#2 +# asm 2: movdqu 176(ee=%xmm1 +movdqu 176(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 192(ss=%xmm0 +movdqu 192(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#2 +# asm 2: movdqu 192(ee=%xmm1 +movdqu 192(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 
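Taken together, syndrome_asm above computes, for each of the 1664 public-key rows, the parity of (row AND error-tail) via pand/popcnt, packs the resulting bits, and finally XORs in the leading 1664 error bits, i.e. the contribution of the identity block when the parity-check matrix is in systematic form H = [I | T]. A hedged scalar sketch of that logic follows (symbolic sizes, illustrative bit order, GCC/Clang __builtin_popcountll standing in for popcnt); it is not a drop-in for this file.

#include <stddef.h>
#include <stdint.h>

static int parity64(uint64_t x) {
    return (int)(__builtin_popcountll(x) & 1);                /* popcnt, reduced mod 2 */
}

/* rows: pk_nrows rows of row_words 64-bit words (the T block of H = [I | T]);
   e_tail: the error-vector words past the first pk_nrows bits;
   e_head: the first pk_nrows/8 bytes of the error vector;
   s: pk_nrows/8 output bytes. */
static void syndrome_sketch(uint8_t *s, const uint64_t *rows,
                            const uint64_t *e_tail, const uint8_t *e_head,
                            int pk_nrows, int row_words) {
    for (int i = 0; i < pk_nrows / 8; i++) {
        s[i] = 0;
    }
    for (int i = 0; i < pk_nrows; i++) {
        uint64_t acc = 0;
        for (int j = 0; j < row_words; j++) {
            acc ^= rows[(size_t)i * row_words + j] & e_tail[j];   /* row & e */
        }
        s[i / 8] |= (uint8_t)(parity64(acc) << (i % 8));          /* bit packing is illustrative */
    }
    for (int i = 0; i < pk_nrows / 8; i++) {
        s[i] ^= e_head[i];                                        /* identity block of H */
    }
}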
+vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld 
$16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 
+vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 
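The repeated vpand/vpsllq/vpsrlq/vpor (and vpslld/vpsrld, vpsllw/vpsrlw) groups above are the standard masked block-swap bit transpose: each pass exchanges d-bit blocks between a pair of registers using a low/high mask pair, with d stepping through 32, 16 and 8 via the MASK5/MASK4/MASK3 constants. A plain 64x64-bit version of the same technique, offered as a sketch of the idea rather than a reimplementation of this file:

#include <stdint.h>

/* Transpose a 64x64 bit matrix stored as 64 row words. */
static void transpose_64x64_sketch(uint64_t *out, const uint64_t *in) {
    static const uint64_t masks[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL},   /* d = 1  */
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL},   /* d = 2  */
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL},   /* d = 4  */
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL},   /* d = 8  */
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL},   /* d = 16 */
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL},   /* d = 32 */
    };
    for (int i = 0; i < 64; i++) {
        out[i] = in[i];
    }
    for (int k = 5; k >= 0; k--) {
        int s = 1 << k;                        /* block width swapped in this pass */
        for (int i = 0; i < 64; i += 2 * s) {
            for (int j = i; j < i + s; j++) {
                uint64_t lo = (out[j] & masks[k][0]) | ((out[j + s] & masks[k][0]) << s);
                uint64_t hi = ((out[j] & masks[k][1]) >> s) | (out[j + s] & masks[k][1]);
                out[j]     = lo;
                out[j + s] = hi;
            }
        }
    }
}

The SSE code applies the same swaps to 128-bit registers loaded 128 bytes apart, which is why the movdqu loads above walk input_0 in strides of 128 before the block offset advances by 16.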
32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 
unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# 
asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld 
$16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw 
$8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 
2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 
+ +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: 
vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: 
vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: 
vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# 
qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 
+ +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = 
x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# 
asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 
496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld 
$16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# 
asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128F_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128F_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 
32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 
+vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 
unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 
128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# 
asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: 
vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# 
qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + 
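
The long, fully unrolled run of vpand/psllq/psrlq/vpor instructions above and below is one stage of a bit-matrix transpose: each group of eight 128-bit rows is pushed through masked "butterfly" layers at shift distances 4, 2 and 1, using the mask pairs (mask0, mask1), (mask2, mask3) and (mask4, mask5) named in the qhasm comments. A minimal portable sketch of a single layer, written on one 64-bit lane (the "2x" psllq/psrlq forms simply apply the same shift to both halves of an xmm register) and assuming the usual construction in which the second mask of a pair is the first one shifted left by the layer distance, is:

#include <stdint.h>

/* One masked "butterfly" layer of the bit-matrix transpose, on a single
 * 64-bit lane.  mask_keep selects the bits that stay in their row and
 * mask_move (typically mask_keep << d) selects the bits that swap rows.
 * The generated code unrolls this for d = 4, 2, 1 with the precomputed
 * mask pairs (mask0,mask1), (mask2,mask3), (mask4,mask5).               */
static void butterfly_layer(uint64_t *lo, uint64_t *hi,
                            uint64_t mask_keep, uint64_t mask_move, int d) {
    uint64_t v00 = *lo & mask_keep;          /* v00 = x_lo & mask_keep        */
    uint64_t v10 = (*hi & mask_keep) << d;   /* v10 = (x_hi & mask_keep) << d */
    uint64_t v01 = (*lo & mask_move) >> d;   /* v01 = (x_lo & mask_move) >> d */
    uint64_t v11 = *hi & mask_move;          /* v11 = x_hi & mask_move        */

    *lo = v00 | v10;                         /* x_lo = v00 | v10 */
    *hi = v01 | v11;                         /* x_hi = v01 | v11 */
}

Rows are paired at the same distance as the bit shift (x0 with x4 for the 4-bit layer, x0 with x2 for the 2-bit layer, adjacent rows for the 1-bit layer), which is the classic recursive pattern whose composition over all layer distances yields the full transpose.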
+# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq 
$2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: 
x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 
& mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 
+movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 
+vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 
unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 
152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128F_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128F_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128F_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128F_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128F_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128F_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128F_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128F_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x( PQCLEAN_MCELIECE6688128F_SSE_load8(in), PQCLEAN_MCELIECE6688128F_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128F_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128F_SSE_store8(out + 0, PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128F_SSE_store8(out + 8, PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6688128f/sse/util.h b/crypto_kem/mceliece6688128f/sse/util.h new file mode 100644 index 00000000..35047efd --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_UTIL_H +#define PQCLEAN_MCELIECE6688128F_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include 
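
The util.c helpers defined above (store_i, store2/load2, load4, store8/load8, store16/load16) read and write integers in little-endian byte order regardless of the host, and irr_load additionally re-packs the Goppa-polynomial coefficients into bitsliced form: bit i of every coefficient is gathered into vector i, so a single 128-bit AND or XOR then operates on bit i of all coefficients at once. A toy scalar version of that re-packing, shrunk to 16 coefficients and 64-bit words purely for illustration (the real routine slices SYS_T + 1 coefficients into GFBITS vec128 values), might look like this:

#include <stdint.h>

/* Toy bitslicing: bit i of every input coefficient is gathered into word i,
 * mirroring the inner loops of irr_load above (illustrative only).         */
static void bitslice16(uint64_t out[13], const uint16_t in[16]) {
    for (int i = 0; i < 13; i++) {
        uint64_t v = 0;
        for (int j = 15; j >= 0; j--) {
            v <<= 1;
            v |= (uint64_t)((in[j] >> i) & 1);
        }
        out[i] = v;
    }
}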
"vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128F_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128F_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6688128F_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128F_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128F_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128F_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6688128F_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128F_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128F_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6688128f/sse/vec128.c b/crypto_kem/mceliece6688128f/sse/vec128.c new file mode 100644 index 00000000..f74a1bb0 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE6688128F_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128F_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[7], t); + result[2] = 
PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6688128F_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6688128f/sse/vec128.h b/crypto_kem/mceliece6688128f/sse/vec128.h new file mode 100644 index 00000000..a7ab8ebc --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE6688128F_SSE_VEC128_H +#define PQCLEAN_MCELIECE6688128F_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
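
vec128_sq above implements squaring as a fixed XOR network, which works because squaring is a linear map over GF(2), and vec128_inv then inverts 128 field elements at once via Fermat's little theorem, raising to the power 2^13 - 2 with the square-and-multiply chain spelled out in its ^11/^1111/... comments. A scalar sketch of the same chain, using simple stand-in helpers (gf_mul here is an illustrative textbook multiply modulo x^13 + x^4 + x^3 + x + 1, the field polynomial of this parameter set, not the project's gf.c routine):

#include <stdint.h>

#define GFBITS 13
#define GFMASK ((1u << GFBITS) - 1)

typedef uint16_t gf;

/* Illustrative scalar GF(2^13) multiply, reduced by x^13 + x^4 + x^3 + x + 1. */
static gf gf_mul(gf a, gf b) {
    uint32_t t = 0;
    for (int i = 0; i < GFBITS; i++) {
        t ^= (uint32_t)a * (b & (1u << i));            /* carry-less schoolbook product */
    }
    for (int i = 2 * GFBITS - 2; i >= GFBITS; i--) {   /* fold degrees 24..13 back down */
        if (t & (1u << i)) {
            t ^= (1u << (i - GFBITS + 4)) | (1u << (i - GFBITS + 3))
               | (1u << (i - GFBITS + 1)) | (1u << (i - GFBITS));
        }
    }
    return (gf)(t & GFMASK);
}

static gf gf_sq(gf a) { return gf_mul(a, a); }

/* Same addition chain as vec128_inv above, on one element instead of 128. */
static gf gf_inv(gf in) {
    gf out, tmp_11, tmp_1111;

    out = gf_sq(in);                 /* a^0b10   */
    tmp_11 = gf_mul(out, in);        /* a^0b11   */

    out = gf_sq(tmp_11);
    out = gf_sq(out);
    tmp_1111 = gf_mul(out, tmp_11);  /* a^0b1111 */

    out = gf_sq(tmp_1111);
    out = gf_sq(out);
    out = gf_sq(out);
    out = gf_sq(out);
    out = gf_mul(out, tmp_1111);     /* a^0b11111111 */

    out = gf_sq(out);
    out = gf_sq(out);
    out = gf_sq(out);
    out = gf_sq(out);
    out = gf_mul(out, tmp_1111);     /* a^0b111111111111 */

    return gf_sq(out);               /* a^0b1111111111110 = a^(2^13 - 2) */
}

The final squaring doubles a^(2^12 - 1) into a^(2^13 - 2), so a * gf_inv(a) = a^(2^13 - 1) = 1 for every nonzero a.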
+#define PQCLEAN_MCELIECE6688128F_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128F_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128F_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128F_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE6688128F_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE6688128F_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece6688128f/sse/vec128_mul_asm.S b/crypto_kem/mceliece6688128f/sse/vec128_mul_asm.S new file mode 100644 index 00000000..383ebb27 --- /dev/null +++ b/crypto_kem/mceliece6688128f/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE6688128F_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 
+movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 
1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# 
qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# 
asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 
+vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 
192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 
24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128F_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6688128f/vec/api.h b/crypto_kem/mceliece6688128f/vec/api.h new file mode 100644 index 00000000..cd45ca3a --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_API_H +#define PQCLEAN_MCELIECE6688128F_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/benes.c b/crypto_kem/mceliece6688128f/vec/benes.c new file mode 100644 index 00000000..253ca15a --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128F_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + 
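+    /* both 64x64 halves are transposed so that layer_ex can apply the outer
+       Benes layers word-wise; layer_in handles the middle layers untransposed */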
PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6688128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/vec/benes.h b/crypto_kem/mceliece6688128f/vec/benes.h new file mode 100644 index 00000000..1dec9ca7 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_BENES_H +#define PQCLEAN_MCELIECE6688128F_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE6688128F_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/bm.c b/crypto_kem/mceliece6688128f/vec/bm.c new file mode 100644 index 00000000..4851a45d --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/bm.c @@ -0,0 +1,245 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + 
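+    /* de-bitslice: the masked interleaves below rearrange the bits so that each
+       16-bit lane of buf[] ends up holding one complete field element, which the
+       final loop extracts and masks with GFMASK */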
vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128F_VEC_bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + vec v[GFBITS]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = 0; + B[0][0] = 0; + B[1][0] = one << 63; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(prod[0], C[0], interval[0]); + 
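+        /* prod[0]/prod[1] hold C times the current coefficient window (interval);
+           vec_reduce below folds them into the scalar discrepancy d */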
PQCLEAN_MCELIECE6688128F_VEC_vec_mul(prod[1], C[1], interval[1]); + update(interval, coefs[N]); + d = vec_reduce(prod); + + t = PQCLEAN_MCELIECE6688128F_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128F_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + v[i] = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out[0], C[0], v); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out[1], C[1], v); +} + diff --git a/crypto_kem/mceliece6688128f/vec/bm.h b/crypto_kem/mceliece6688128f/vec/bm.h new file mode 100644 index 00000000..98892eba --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_BM_H +#define PQCLEAN_MCELIECE6688128F_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128F_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/consts.inc b/crypto_kem/mceliece6688128f/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 
0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 
0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 
0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 
0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 
0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 
0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 
0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 
0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece6688128f/vec/controlbits.c b/crypto_kem/mceliece6688128f/vec/controlbits.c new file mode 100644 index 00000000..98730f50 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128F_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128F_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128F_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t 
*pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128F_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/vec/controlbits.h b/crypto_kem/mceliece6688128f/vec/controlbits.h new file mode 100644 index 00000000..da59ceda --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128F_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128F_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/crypto_hash.h b/crypto_kem/mceliece6688128f/vec/crypto_hash.h new file mode 100644 index 00000000..1d014c42 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128F_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6688128f/vec/decrypt.c b/crypto_kem/mceliece6688128f/vec/decrypt.c new file mode 100644 index 00000000..6d1368cd --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/decrypt.c @@ -0,0 +1,191 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6688128F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128F_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE6688128F_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6688128F_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static int weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return 
(uint16_t)PQCLEAN_MCELIECE6688128F_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128F_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6688128F_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6688128F_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128F_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128F_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE6688128F_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6688128F_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6688128F_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = (uint16_t)weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6688128f/vec/decrypt.h b/crypto_kem/mceliece6688128f/vec/decrypt.h new file mode 100644 index 00000000..bcac4a59 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE6688128F_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128F_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/encrypt.c b/crypto_kem/mceliece6688128f/vec/encrypt.c new file mode 100644 index 00000000..1bb530e9 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/encrypt.c @@ -0,0 +1,142 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j 
< SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128F_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE6688128F_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE6688128F_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= PQCLEAN_MCELIECE6688128F_VEC_load4(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE6688128F_VEC_load4(e_ptr8 + 8 * j); + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128F_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6688128f/vec/encrypt.h b/crypto_kem/mceliece6688128f/vec/encrypt.h new file mode 100644 index 00000000..20f8b723 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128F_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128F_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/fft.c b/crypto_kem/mceliece6688128f/vec/fft.c new file mode 100644 index 00000000..320663d7 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/fft.c @@ -0,0 +1,274 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, 
k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 128 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = 
buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 128; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128F_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} diff --git a/crypto_kem/mceliece6688128f/vec/fft.h b/crypto_kem/mceliece6688128f/vec/fft.h new file mode 100644 index 00000000..f61c0135 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_FFT_H +#define PQCLEAN_MCELIECE6688128F_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128F_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/fft_tr.c b/crypto_kem/mceliece6688128f/vec/fft_tr.c new file mode 100644 index 
00000000..3565cb9f --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/fft_tr.c @@ -0,0 +1,289 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + 
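+                /* The rest of this unrolled chain visits the remaining buf[k][]
+                   entries in a Gray-code order (consecutive indices differ in a
+                   single bit), XOR-accumulating each visited value into one of
+                   the running sums pre[0..5][k][i]; it is (roughly) the
+                   transpose of the unrolled chain in fft.c's butterflies().
+                   The pre[] sums are scaled by bits of beta[] further below
+                   and folded into out[2] and out[3]. */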
pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; 
j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +void PQCLEAN_MCELIECE6688128F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece6688128f/vec/fft_tr.h b/crypto_kem/mceliece6688128f/vec/fft_tr.h new file mode 100644 index 00000000..374fe29e --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE6688128F_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6688128F_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/gf.c b/crypto_kem/mceliece6688128f/vec/gf.c new file mode 100644 index 00000000..7f7470ac --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128F_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128F_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= 
(t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128F_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128F_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128F_VEC_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128F_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128F_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128F_VEC_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128F_VEC_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128F_VEC_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6688128f/vec/gf.h b/crypto_kem/mceliece6688128f/vec/gf.h new file mode 100644 index 00000000..02504ef5 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_GF_H +#define PQCLEAN_MCELIECE6688128F_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128F_VEC_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6688128F_VEC_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6688128F_VEC_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6688128F_VEC_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6688128F_VEC_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6688128F_VEC_gf_inv(gf in); + +void PQCLEAN_MCELIECE6688128F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/operations.c b/crypto_kem/mceliece6688128f/vec/operations.c new file mode 100644 index 00000000..9db050f5 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include 
"aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128F_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128F_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128F_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128F_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128F_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128F_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128F_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128F_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128F_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128F_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/vec/params.h b/crypto_kem/mceliece6688128f/vec/params.h new file mode 100644 index 00000000..2f5a0c31 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_PARAMS_H +#define PQCLEAN_MCELIECE6688128F_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define 
SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/pk_gen.c b/crypto_kem/mceliece6688128f/vec/pk_gen.c new file mode 100644 index 00000000..55880b23 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/pk_gen.c @@ -0,0 +1,304 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ (SYS_N + 63) / 64 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
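+    // (ctz() above scans all 64 bits in a loop instead of branching on the
+    //  first set bit, and the row and permutation updates below use arithmetic
+    //  masks / same_mask() rather than branching on individual pivot bits)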
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6688128F_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE6688128F_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128F_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if 
(mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H - 1; j++) { + PQCLEAN_MCELIECE6688128F_VEC_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128F_VEC_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/vec/pk_gen.h b/crypto_kem/mceliece6688128f/vec/pk_gen.h new file mode 100644 index 00000000..8e9adeda --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE6688128F_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6688128F_VEC_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/powers.inc b/crypto_kem/mceliece6688128f/vec/powers.inc new file mode 100644 index 00000000..a9bd6179 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/powers.inc @@ -0,0 +1,1920 @@ +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 
0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 
0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 
0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 
0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 
0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 
0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 
0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 
0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 
0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 
0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 
0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +} diff --git a/crypto_kem/mceliece6688128f/vec/scalars_2x.inc b/crypto_kem/mceliece6688128f/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece6688128f/vec/scalars_4x.inc b/crypto_kem/mceliece6688128f/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 
0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 
0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece6688128f/vec/sk_gen.c b/crypto_kem/mceliece6688128f/vec/sk_gen.c new file mode 100644 index 00000000..7282da40 --- /dev/null +++ 
b/crypto_kem/mceliece6688128f/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128F_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128F_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128F_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128F_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128F_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128F_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128F_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128F_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6688128f/vec/sk_gen.h b/crypto_kem/mceliece6688128f/vec/sk_gen.h new file mode 100644 index 00000000..c4619041 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE6688128F_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6688128F_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128F_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/transpose.c b/crypto_kem/mceliece6688128f/vec/transpose.c new file mode 100644 index 00000000..7c47b796 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = 
((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece6688128f/vec/transpose.h b/crypto_kem/mceliece6688128f/vec/transpose.h new file mode 100644 index 00000000..1281455b --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE6688128F_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE6688128F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/util.c b/crypto_kem/mceliece6688128f/vec/util.c new file mode 100644 index 00000000..1525f7bb --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6688128F_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128F_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128F_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128F_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128F_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE6688128F_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128F_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece6688128f/vec/util.h b/crypto_kem/mceliece6688128f/vec/util.h new file mode 100644 index 00000000..f7f5b4ed --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/util.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_UTIL_H +#define PQCLEAN_MCELIECE6688128F_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6688128F_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE6688128F_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE6688128F_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE6688128F_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE6688128F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE6688128F_VEC_store8(unsigned char 
*out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE6688128F_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece6688128f/vec/vec.c b/crypto_kem/mceliece6688128f/vec/vec.c new file mode 100644 index 00000000..93163263 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/vec.c @@ -0,0 +1,139 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE6688128F_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE6688128F_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return ((int)a & 1) ^ 1; +} + + +void PQCLEAN_MCELIECE6688128F_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6688128F_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6688128F_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6688128F_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6688128F_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6688128F_VEC_vec_sq(out, out); // ^1111111111110 +} + diff 
--git a/crypto_kem/mceliece6688128f/vec/vec.h b/crypto_kem/mceliece6688128f/vec/vec.h new file mode 100644 index 00000000..1488a437 --- /dev/null +++ b/crypto_kem/mceliece6688128f/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE6688128F_VEC_VEC_H +#define PQCLEAN_MCELIECE6688128F_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE6688128F_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE6688128F_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE6688128F_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE6688128F_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE6688128F_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE6688128F_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/META.yml b/crypto_kem/mceliece6960119/META.yml new file mode 100644 index 00000000..3d4e37bb --- /dev/null +++ b/crypto_kem/mceliece6960119/META.yml @@ -0,0 +1,48 @@ +name: Classic McEliece 6960119 +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1047319 +length-secret-key: 13908 +length-ciphertext: 226 +length-shared-secret: 32 +nistkat-sha256: 653ada51f795f7c606a6316f6c6db50f18804fe4a07aa26c78dc8f4ae2f9bccd +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt diff --git a/crypto_kem/mceliece6960119/avx/LICENSE b/crypto_kem/mceliece6960119/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6960119/avx/Makefile b/crypto_kem/mceliece6960119/avx/Makefile new file mode 100644 index 00000000..9b6b8037 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6960119_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6960119/avx/aes256ctr.c b/crypto_kem/mceliece6960119/avx/aes256ctr.c new file mode 100644 index 00000000..971c665d --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6960119_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6960119/avx/aes256ctr.h b/crypto_kem/mceliece6960119/avx/aes256ctr.h new file mode 100644 index 00000000..5e9bddf9 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE6960119_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119/avx/api.h b/crypto_kem/mceliece6960119/avx/api.h new file mode 100644 index 00000000..ddae8a93 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_API_H +#define PQCLEAN_MCELIECE6960119_AVX_API_H + +#include + +#define 
PQCLEAN_MCELIECE6960119_AVX_CRYPTO_ALGNAME "Classic McEliece 6960119" +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/benes.c b/crypto_kem/mceliece6960119/avx/benes.c new file mode 100644 index 00000000..4d859b10 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + 
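// same constant-time conditional-swap pattern as the pairs above: the masked difference is XORed back into both entries +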
diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = 
PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119_AVX_load8(ptr), PQCLEAN_MCELIECE6960119_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119_AVX_load8(ptr), PQCLEAN_MCELIECE6960119_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, 
b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6960119/avx/benes.h b/crypto_kem/mceliece6960119/avx/benes.h new file mode 100644 index 00000000..0120309c --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_BENES_H +#define PQCLEAN_MCELIECE6960119_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6960119_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/bm.c b/crypto_kem/mceliece6960119/avx/bm.c new file mode 100644 index 00000000..f80259ab --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/bm.c @@ -0,0 +1,210 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6960119_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6960119_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6960119_AVX_vec256_or(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6960119_AVX_vec256_or(PQCLEAN_MCELIECE6960119_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0, one << 62); + BC[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + + d = PQCLEAN_MCELIECE6960119_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + 
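// inversion-free BM step: the cmov above conditionally copies lane 1 of BC into lane 0 (under mask), and the loop a few lines below recombines the products already held in BC_tmp into the new lane 1 +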
PQCLEAN_MCELIECE6960119_AVX_update_asm(BC, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(BC[i][1], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(BC[i][1], 1); + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/crypto_kem/mceliece6960119/avx/bm.h b/crypto_kem/mceliece6960119/avx/bm.h new file mode 100644 index 00000000..b3205008 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_BM_H +#define PQCLEAN_MCELIECE6960119_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/consts.S b/crypto_kem/mceliece6960119/avx/consts.S new file mode 100644 index 00000000..e4abaf61 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6960119_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6960119_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6960119/avx/consts.inc b/crypto_kem/mceliece6960119/avx/consts.inc new file mode 100644 index 00000000..1f8f716c --- /dev/null +++ 
b/crypto_kem/mceliece6960119/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 
0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966969969669, 
0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 
0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 
0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece6960119/avx/controlbits.c b/crypto_kem/mceliece6960119/avx/controlbits.c new file mode 100644 index 00000000..da9d0193 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void 
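/* Constant-time comparator used by controlbitsfrompermutation below (summary of the
   code as written): the low w bits of *x and *y are compared and, through a masked
   swap on a local copy, *x ends up holding the smaller of the two; when t > 0, bit w
   of *x is additionally flipped to record whether a swap took place. */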
flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/avx/controlbits.h b/crypto_kem/mceliece6960119/avx/controlbits.h new file mode 100644 index 00000000..8e184b4d --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/crypto_hash.h b/crypto_kem/mceliece6960119/avx/crypto_hash.h new file mode 100644 index 00000000..222688ee --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119/avx/decrypt.c b/crypto_kem/mceliece6960119/avx/decrypt.c new file mode 100644 index 00000000..5bcd7e56 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/decrypt.c @@ -0,0 +1,236 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6960119_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( 
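/* weight check: w0 accumulates the Hamming weight of the bitsliced error words
       (two 64-bit popcounts per vec128); w1 below does the same for the packed bytes,
       and the constant-time comparison at the end should return 1 only when both
       weights equal SYS_T. */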
PQCLEAN_MCELIECE6960119_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec256_or(diff, PQCLEAN_MCELIECE6960119_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6960119_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6960119_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6960119_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6960119_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6960119_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119/avx/decrypt.h 
b/crypto_kem/mceliece6960119/avx/decrypt.h new file mode 100644 index 00000000..c3dc8444 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6960119_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/encrypt.c b/crypto_kem/mceliece6960119/avx/encrypt.c new file mode 100644 index 00000000..aa63109a --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6960119_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6960119_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119/avx/encrypt.h b/crypto_kem/mceliece6960119/avx/encrypt.h new file mode 100644 index 00000000..5bdf635f --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/fft.c b/crypto_kem/mceliece6960119/avx/fft.c new file mode 100644 index 00000000..59ba1717 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* 
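   Radix conversion, roughly: each pass of the loop below rewrites the bitsliced
   polynomial as f(x) = f0(x^2 + x) + x*f1(x^2 + x) using the mask/shift pairs,
   then multiplies by the constants in s[] to prepare the next level; see the GM10
   paper referenced above for the exact derivation.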
input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = 
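/* pre[i] is, in effect, the bitsliced GF(2^m) product of the upper half of in
           with beta[i], broadcast into both 64-bit lanes; the xor network below
           combines these products to build the evaluation points before the transpose. */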
PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = 
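/* each buf.V entry is derived from an earlier one by xoring in exactly one pre[]
        term; in effect a Gray-code walk over subsets of the basis products, so the
        64 evaluation points cost essentially one 256-bit xor each. */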
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119/avx/fft.h 
b/crypto_kem/mceliece6960119/avx/fft.h new file mode 100644 index 00000000..27f69741 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_FFT_H +#define PQCLEAN_MCELIECE6960119_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6960119_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/fft_tr.c b/crypto_kem/mceliece6960119/avx/fft_tr.c new file mode 100644 index 00000000..b7fc2439 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/fft_tr.c @@ -0,0 +1,400 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = 
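/* cross-lane part of the transposed conversion: the 32-bit shifts and the
                word-to-word xors that follow do not map onto the in-lane vector
                operations above, so the four 64-bit words are extracted, fixed up
                in scalar registers and repacked. */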
PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = 
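/* roughly the transpose of the Gray-code network in fft.c: successive buf.V
        entries are folded into one another while the contribution of each basis
        product is accumulated into pre.V[0..5]. */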
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(out128[b][1], 
tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 3); + + v[3] <<= (128 - SYS_T) * 2; + v[3] >>= (128 - SYS_T) * 2; + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + + +void PQCLEAN_MCELIECE6960119_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119/avx/fft_tr.h b/crypto_kem/mceliece6960119/avx/fft_tr.h new file mode 100644 index 00000000..091f30c4 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6960119_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/gf.c b/crypto_kem/mceliece6960119/avx/gf.c new file mode 100644 index 00000000..14a5d2bd --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= 
(t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/avx/gf.h b/crypto_kem/mceliece6960119/avx/gf.h new file mode 100644 index 00000000..543e55ef --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_GF_H +#define PQCLEAN_MCELIECE6960119_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6960119_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6960119_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE6960119_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/int32_sort.c b/crypto_kem/mceliece6960119/avx/int32_sort.c new file mode 100644 index 00000000..30ffeb17 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + 
do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + 
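        /* The twelve int32x8_MINMAX calls above realize the 4q, 2q and q
           comparator stages of the bitonic merge on eight q-strided vectors;
           the surrounding stores write the merged block back in place before
           the next 8*q block is processed. */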
int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) 
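    /* n == 32: recursively sort the two 16-element halves (the first with
       flagdown set), apply the outer flagdown complement if requested, then a
       cross layer of MINMAX plus two merge16_finish calls completes the merge. */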
{ + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; 
k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 
x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = 
_mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } 
+ } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6960119_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + 
int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6960119_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = 
_mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, 
d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece6960119/avx/int32_sort.h b/crypto_kem/mceliece6960119/avx/int32_sort.h new file mode 100644 index 00000000..1bb75c3c --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6960119_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6960119_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6960119/avx/operations.c b/crypto_kem/mceliece6960119/avx/operations.c new file mode 100644 index 00000000..68577b49 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const 
uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/avx/params.h b/crypto_kem/mceliece6960119/avx/params.h new file mode 100644 index 00000000..e50eff28 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6960119_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/pk_gen.c b/crypto_kem/mceliece6960119/avx/pk_gen.c new file mode 100644 index 00000000..df0b63ad --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/pk_gen.c @@ -0,0 +1,292 @@ +/* + This file is for public-key generation +*/ + 
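/*
   For orientation: every size used by the functions in this implementation
   follows from the three parameters in params.h above (GFBITS = 13,
   SYS_N = 6960, SYS_T = 119).  The stand-alone sketch below is not part of
   this patch; it only restates those macros and prints the derived
   quantities, e.g. the m*t = 1547 rows of the public-key matrix and the
   SYND_BYTES + 32 byte ciphertext that crypto_kem_enc produces (syndrome
   followed by the 32-byte confirmation hash).  The printed values can be
   compared against the lengths recorded in this parameter set's META.yml.
*/

#include <stdio.h>

#define GFBITS 13
#define SYS_N  6960
#define SYS_T  119

#define IRR_BYTES    (SYS_T * 2)
#define COND_BYTES   ((1 << (GFBITS - 4)) * (2 * GFBITS - 1))
#define PK_NROWS     (SYS_T * GFBITS)
#define PK_NCOLS     (SYS_N - PK_NROWS)
#define PK_ROW_BYTES ((PK_NCOLS + 7) / 8)
#define SYND_BYTES   ((PK_NROWS + 7) / 8)

int main(void) {
    /* rows and columns of the non-identity part of the parity-check matrix */
    printf("PK_NROWS                          = %d\n", PK_NROWS);
    printf("PK_NCOLS                          = %d\n", PK_NCOLS);
    /* storage: public key, secret key (s || Goppa poly || control bits), ciphertext */
    printf("PK_NROWS * PK_ROW_BYTES           = %d\n", PK_NROWS * PK_ROW_BYTES);
    printf("SYS_N/8 + IRR_BYTES + COND_BYTES  = %d\n", SYS_N / 8 + IRR_BYTES + COND_BYTES);
    printf("SYND_BYTES + 32                   = %d\n", SYND_BYTES + 32);
    return 0;
}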
+#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE6960119_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 
0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + 
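// multiply ops (the row operations recorded above) with the right part of mat to get one row of the public key; the tail shift below drops the bits that still belong to the identity block + 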
for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/avx/pk_gen.h b/crypto_kem/mceliece6960119/avx/pk_gen.h new file mode 100644 index 00000000..dc4baa39 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6960119_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/scalars_2x.inc b/crypto_kem/mceliece6960119/avx/scalars_2x.inc new file mode 100644 index 00000000..69496ccf --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6960119/avx/scalars_4x.inc b/crypto_kem/mceliece6960119/avx/scalars_4x.inc new file mode 100644 index 00000000..998eb2f1 --- /dev/null +++ 
b/crypto_kem/mceliece6960119/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece6960119/avx/sk_gen.c b/crypto_kem/mceliece6960119/avx/sk_gen.c new file mode 100644 index 00000000..ebc5aa4e --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119_AVX_GF_mul(mat[j], mat[j 
- 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/avx/sk_gen.h b/crypto_kem/mceliece6960119/avx/sk_gen.h new file mode 100644 index 00000000..6a45a26f --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6960119_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/syndrome_asm.S b/crypto_kem/mceliece6960119/avx/syndrome_asm.S new file mode 100644 index 00000000..26b117a9 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/syndrome_asm.S @@ -0,0 +1,921 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64 i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6960119_AVX_syndrome_asm +_PQCLEAN_MCELIECE6960119_AVX_syndrome_asm: +PQCLEAN_MCELIECE6960119_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea 
(addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#3 +# asm 2: vmovupd 32(ee=%ymm2 +vmovupd 32(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#3 +# asm 2: vmovupd 64(ee=%ymm2 +vmovupd 64(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#3 +# asm 2: vmovupd 96(ee=%ymm2 +vmovupd 96(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee 
= mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 416 ] +# asm 1: vmovupd 416(ee=reg256#3 +# asm 2: vmovupd 416(ee=%ymm2 +vmovupd 416(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 448 ] +# asm 1: vmovupd 448(ee=reg256#3 +# asm 2: vmovupd 448(ee=%ymm2 +vmovupd 448(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 480 ] +# asm 1: vmovupd 480(ee=reg256#3 +# asm 2: vmovupd 480(ee=%ymm2 +vmovupd 480(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 512 ] +# asm 1: vmovupd 512(ee=reg256#3 +# asm 2: vmovupd 512(ee=%ymm2 +vmovupd 512(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 544 ] +# asm 1: vmovupd 544(ee=reg256#3 +# asm 2: vmovupd 544(ee=%ymm2 +vmovupd 544(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 576 ] +# asm 1: vmovupd 576(ee=reg256#3 +# asm 2: vmovupd 576(ee=%ymm2 +vmovupd 576(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 608 ] +# asm 1: vmovupd 608(ee=reg256#3 +# asm 2: vmovupd 608(ee=%ymm2 +vmovupd 608(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 640 ] +# asm 1: vmovupd 640(ee=reg256#3 +# asm 2: vmovupd 640(ee=%ymm2 +vmovupd 640(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: 
(uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov $676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: +._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 
640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor 
x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor 
x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand 
%xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# 
asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: 
vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: 
vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 
+vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 
+# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: 
vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# 
asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ 
input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor 
%xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: 
v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 
+# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 
1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 
+movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor 
%xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand 
v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 
= mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + 
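+# ----------------------------------------------------------------------------
+# Editorial aside (illustrative only, not part of the qhasm-generated source):
+# each vpand/shift/vpor group in these routines is one "butterfly" step of a
+# bitwise matrix transpose. In the qhasm pseudo-notation this file already
+# uses, a step with stride s on a row pair (lo, hi) computes
+#
+#     v00 = lo & mask_keep            # bits of lo that stay in place
+#     v10 = (hi & mask_keep) << s     # bits of hi moving into lo
+#     v01 = (lo & mask_move) >> s     # bits of lo moving into hi
+#     v11 = hi & mask_move            # bits of hi that stay in place
+#     lo  = v00 | v10
+#     hi  = v01 | v11
+#
+# The cross-register passes above use s = 32, 16, 8 (where the 32-bit shift
+# itself discards the masked-off half, so one vpand is omitted); the passes in
+# this routine work within each 128-bit register with s = 4, 2, 1 via
+# psllq/psrlq. "mask_keep"/"mask_move" are informal names for the MASK*_0 /
+# MASK*_1 constants loaded at the top of the routine.
+# ----------------------------------------------------------------------------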
+# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# 
qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 
+vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 
2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand 
v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq 
$4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 
1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# 
qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# 
asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 
2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & 
mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# 
qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# 
qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand 
%xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor 
x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 
+movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor 
x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand 
%xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 
1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor 
x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand 
%ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: 
vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor 
x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + 
+# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 
+vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 
+ +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand 
v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor 
x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# 
qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand 
v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor 
%ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: 
vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand 
v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor 
%ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: 
vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 
2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# 
qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq 
$32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = 
v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119_AVX_MASK0_1 ] +# asm 1: vmovapd 
PQCLEAN_MCELIECE6960119_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand 
%ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: 
vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: 
v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor 
%ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 
+vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 
| v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = 
x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# 
asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 
1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 
& mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & 
mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 
1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & 
mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + 
+# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 
+# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 
1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor 
%ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 
= x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6960119_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6960119/avx/update_asm.S b/crypto_kem/mceliece6960119/avx/update_asm.S new file mode 100644 index 00000000..9ec1b6e9 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_update_asm +.global PQCLEAN_MCELIECE6960119_AVX_update_asm +_PQCLEAN_MCELIECE6960119_AVX_update_asm: +PQCLEAN_MCELIECE6960119_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# 
qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void 
PQCLEAN_MCELIECE6960119_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119_AVX_vec128_set2x( PQCLEAN_MCELIECE6960119_AVX_load8(in), PQCLEAN_MCELIECE6960119_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119_AVX_store8(out + 0, PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119_AVX_store8(out + 8, PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6960119/avx/util.h b/crypto_kem/mceliece6960119/avx/util.h new file mode 100644 index 00000000..37480980 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_UTIL_H +#define PQCLEAN_MCELIECE6960119_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6960119/avx/vec128.c b/crypto_kem/mceliece6960119/avx/vec128.c new file mode 100644 index 00000000..2aca3544 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 
PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece6960119/avx/vec128.h b/crypto_kem/mceliece6960119/avx/vec128.h new file mode 100644 index 00000000..479f20da --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_VEC128_H +#define PQCLEAN_MCELIECE6960119_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE6960119_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece6960119/avx/vec128_mul_asm.S b/crypto_kem/mceliece6960119/avx/vec128_mul_asm.S new file mode 100644 index 00000000..a2f9e198 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 
h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r 
+# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# 
asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub 
r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# 
qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: 
movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# 
qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6960119_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6960119_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6960119_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6960119_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6960119_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece6960119/avx/vec256_ama_asm.S b/crypto_kem/mceliece6960119/avx/vec256_ama_asm.S new file mode 100644 index 00000000..966a9539 --- /dev/null +++ b/crypto_kem/mceliece6960119/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: 
reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 
+vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 
+# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 
224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# 
asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: 
vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# 
asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 
2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 
352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: 
r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# 
asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 
288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: 
vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 
160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = 
mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 
352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 
120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119/clean/api.h b/crypto_kem/mceliece6960119/clean/api.h new file mode 100644 index 00000000..60071328 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_API_H +#define PQCLEAN_MCELIECE6960119_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_ALGNAME "Classic McEliece 6960119" +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/benes.c 
b/crypto_kem/mceliece6960119/clean/benes.c new file mode 100644 index 00000000..eaacd7bb --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6960119_CLEAN_store8(r_ptr + 
i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE6960119_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE6960119_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE6960119_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE6960119_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece6960119/clean/benes.h b/crypto_kem/mceliece6960119/clean/benes.h new file mode 100644 index 00000000..cbd132b9 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_BENES_H +#define PQCLEAN_MCELIECE6960119_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE6960119_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/bm.c b/crypto_kem/mceliece6960119/clean/bm.c new file mode 100644 index 00000000..8c9d2f14 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE6960119_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE6960119_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece6960119/clean/bm.h b/crypto_kem/mceliece6960119/clean/bm.h new file mode 100644 index 00000000..ee121af8 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_BM_H +#define PQCLEAN_MCELIECE6960119_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/controlbits.c b/crypto_kem/mceliece6960119/clean/controlbits.c new file mode 100644 index 00000000..9a11b161 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/clean/controlbits.h b/crypto_kem/mceliece6960119/clean/controlbits.h new file mode 100644 index 00000000..c2ee7ab4 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/crypto_hash.h b/crypto_kem/mceliece6960119/clean/crypto_hash.h new file mode 100644 index 00000000..90a7672b --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119/clean/decrypt.c b/crypto_kem/mceliece6960119/clean/decrypt.c new file mode 100644 index 00000000..189b9553 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/decrypt.c @@ -0,0 +1,92 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" +#include + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6960119_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE6960119_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE6960119_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE6960119_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE6960119_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE6960119_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE6960119_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece6960119/clean/decrypt.h b/crypto_kem/mceliece6960119/clean/decrypt.h new file mode 100644 index 00000000..7498262e --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE6960119_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/encrypt.c b/crypto_kem/mceliece6960119/clean/encrypt.c new file mode 100644 index 00000000..caa24df6 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/encrypt.c @@ -0,0 +1,144 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include 
"util.h" + +#include +#include +#include +#include + +#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + unsigned char mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ind_8[i + 1] << 8 | ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, ind[j] >> 3); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j, tail = PK_NROWS % 8; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + for (j = SYS_N / 8 - 1; j >= SYS_N / 8 - PK_ROW_BYTES; j--) { + row[ j ] = (row[ j ] << tail) | (row[j - 1] >> (8 - tail)); + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE6960119_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119/clean/encrypt.h b/crypto_kem/mceliece6960119/clean/encrypt.h new file mode 100644 index 00000000..df98bc82 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/gf.c b/crypto_kem/mceliece6960119/clean/gf.c new file mode 100644 index 00000000..cce8f277 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/gf.c @@ -0,0 +1,209 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + 
uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE6960119_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 2] ^= 
PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(prod[i], (gf) 6400); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/clean/gf.h b/crypto_kem/mceliece6960119/clean/gf.h new file mode 100644 index 00000000..758aa33b --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_GF_H +#define PQCLEAN_MCELIECE6960119_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE6960119_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE6960119_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/operations.c b/crypto_kem/mceliece6960119/clean/operations.c new file mode 100644 index 00000000..b4fa7df0 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = 
PQCLEAN_MCELIECE6960119_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/clean/params.h b/crypto_kem/mceliece6960119/clean/params.h new file mode 100644 index 00000000..8d56ed61 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE6960119_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/pk_gen.c b/crypto_kem/mceliece6960119/clean/pk_gen.c new file mode 100644 index 00000000..6d90d88f --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/pk_gen.c @@ -0,0 +1,154 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE6960119_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + unsigned char *pk_ptr = pk; + + int i, j, k; + int row, c, tail; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6960119_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE6960119_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE6960119_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE6960119_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = 
PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + tail = (GFBITS * SYS_T) % 8; + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = (GFBITS * SYS_T - 1) / 8; j < SYS_N / 8 - 1; j++) { + *pk_ptr++ = (mat[i][j] >> tail) | (mat[i][j + 1] << (8 - tail)); + } + + *pk_ptr++ = (mat[i][j] >> tail); + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/clean/pk_gen.h b/crypto_kem/mceliece6960119/clean/pk_gen.h new file mode 100644 index 00000000..977ac3d8 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE6960119_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6960119_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/root.c b/crypto_kem/mceliece6960119/clean/root.c new file mode 100644 index 00000000..835bf1c8 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE6960119_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE6960119_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE6960119_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE6960119_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece6960119/clean/root.h b/crypto_kem/mceliece6960119/clean/root.h new file mode 100644 index 00000000..2d91d02d --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE6960119_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE6960119_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE6960119_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/sk_gen.c b/crypto_kem/mceliece6960119/clean/sk_gen.c new file mode 100644 index 00000000..4e3b76f2 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for 
failure */ +int PQCLEAN_MCELIECE6960119_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/clean/sk_gen.h b/crypto_kem/mceliece6960119/clean/sk_gen.h new file mode 100644 index 00000000..a0b36f6e --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE6960119_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/synd.c b/crypto_kem/mceliece6960119/clean/synd.c new file mode 100644 index 00000000..62f79bfe --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE6960119_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE6960119_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE6960119_CLEAN_gf_inv(PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE6960119_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE6960119_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece6960119/clean/synd.h b/crypto_kem/mceliece6960119/clean/synd.h new file mode 100644 index 00000000..ab1aad51 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_SYND_H +#define 
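/*
 * Illustrative sketch: PQCLEAN_MCELIECE6960119_CLEAN_eval above is Horner's
 * rule, computing f(a) = (...((f_t*a + f_{t-1})*a + f_{t-2})*a + ...) + f_0
 * with one gf_mul and one gf_add per coefficient; the syndrome routine calls
 * it once per support element.  The same recurrence over ordinary integers,
 * with horner_eval as a local name for illustration:
 */
#include <assert.h>
#include <stdint.h>

static int64_t horner_eval(const int64_t *f, int deg, int64_t a) {
    int64_t r = f[deg];
    int i;

    for (i = deg - 1; i >= 0; i--) {
        r = r * a + f[i];                   /* gf_mul / gf_add in the field case */
    }
    return r;
}

int main(void) {
    int64_t f[3] = {5, 3, 2};               /* f(x) = 2x^2 + 3x + 5 */
    assert(horner_eval(f, 2, 4) == 49);     /* 2*16 + 3*4 + 5 */
    return 0;
}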
PQCLEAN_MCELIECE6960119_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/transpose.c b/crypto_kem/mceliece6960119/clean/transpose.c new file mode 100644 index 00000000..ea341057 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece6960119/clean/transpose.h b/crypto_kem/mceliece6960119/clean/transpose.h new file mode 100644 index 00000000..aa61bd84 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE6960119_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/clean/util.c b/crypto_kem/mceliece6960119/clean/util.c new file mode 100644 index 00000000..af23b2b4 --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE6960119_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE6960119_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE6960119_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git 
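/*
 * Illustrative sketch: the mask-and-shift block swaps used by
 * PQCLEAN_MCELIECE6960119_CLEAN_transpose_64x64 above, scaled down to an
 * 8x8 bit matrix stored as eight uint8_t rows (bit k of row i holds entry
 * (i, k)).  The step at scale s = 2^d exchanges bit d of the row index with
 * bit d of the column index; composing all scales gives the transpose.
 * transpose_8x8_sketch is a local name for illustration only.
 */
#include <assert.h>
#include <stdint.h>

static void transpose_8x8_sketch(uint8_t *out, const uint8_t *in) {
    static const uint8_t masks[3][2] = {
        {0x55, 0xAA}, {0x33, 0xCC}, {0x0F, 0xF0}
    };
    int i, j, d, s;
    uint8_t x, y;

    for (i = 0; i < 8; i++) {
        out[i] = in[i];
    }
    for (d = 2; d >= 0; d--) {
        s = 1 << d;
        for (i = 0; i < 8; i += 2 * s) {
            for (j = i; j < i + s; j++) {
                x = (uint8_t)((out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s));
                y = (uint8_t)(((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]));
                out[j] = x;
                out[j + s] = y;
            }
        }
    }
}

int main(void) {
    uint8_t in[8] = {0x08, 0, 0, 0, 0, 0, 0, 0};   /* single 1 at entry (0, 3) */
    uint8_t out[8];
    int i;

    transpose_8x8_sketch(out, in);
    for (i = 0; i < 8; i++) {
        assert(out[i] == (i == 3 ? 0x01 : 0));     /* moved to entry (3, 0) */
    }
    return 0;
}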
a/crypto_kem/mceliece6960119/clean/util.h b/crypto_kem/mceliece6960119/clean/util.h new file mode 100644 index 00000000..d9cd66fe --- /dev/null +++ b/crypto_kem/mceliece6960119/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE6960119_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE6960119_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE6960119_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE6960119_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE6960119_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE6960119_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE6960119_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/LICENSE b/crypto_kem/mceliece6960119/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6960119/sse/Makefile b/crypto_kem/mceliece6960119/sse/Makefile new file mode 100644 index 00000000..8fc16cf3 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6960119_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6960119/sse/aes256ctr.c b/crypto_kem/mceliece6960119/sse/aes256ctr.c new file mode 100644 index 00000000..c918f72c --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6960119_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git 
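/*
 * Usage sketch for the AES-256-CTR wrapper defined above: expanding a
 * 32-byte seed into a pseudorandom byte stream, the way the key-generation
 * code derives its irreducible-polynomial and permutation candidates.
 * expand_seed_sketch is a local name for illustration; AESCTR_NONCEBYTES and
 * AES256_KEYBYTES come from the shared aes.h header, and an AES-256 key is
 * assumed to be 32 bytes.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "aes256ctr.h"

void expand_seed_sketch(uint8_t *stream, size_t streamlen,
                        const uint8_t seed[32]) {
    uint8_t nonce[AESCTR_NONCEBYTES] = {0};   /* fixed all-zero nonce */
    uint8_t key[AES256_KEYBYTES];

    memcpy(key, seed, sizeof(key));           /* 32-byte AES-256 key */
    PQCLEAN_MCELIECE6960119_SSE_aes256ctr(stream, streamlen, nonce, key);
}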
a/crypto_kem/mceliece6960119/sse/aes256ctr.h b/crypto_kem/mceliece6960119/sse/aes256ctr.h new file mode 100644 index 00000000..66f5037e --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE6960119_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119/sse/api.h b/crypto_kem/mceliece6960119/sse/api.h new file mode 100644 index 00000000..eb17af0a --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_API_H +#define PQCLEAN_MCELIECE6960119_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_ALGNAME "Classic McEliece 6960119" +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/benes.c b/crypto_kem/mceliece6960119/sse/benes.c new file mode 100644 index 00000000..855694fb --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = 
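/*
 * Usage sketch for the KEM API declared in api.h above: a full
 * keypair/encapsulate/decapsulate round trip, checking that both sides
 * derive the same 32-byte session key.  randombytes() from the PQClean
 * common code must be linked in; the public key is close to 1 MB, so the
 * key buffers are heap-allocated.  kem_round_trip_sketch is a local name
 * for illustration only.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "api.h"

int kem_round_trip_sketch(void) {
    uint8_t *pk = malloc(PQCLEAN_MCELIECE6960119_SSE_CRYPTO_PUBLICKEYBYTES);
    uint8_t *sk = malloc(PQCLEAN_MCELIECE6960119_SSE_CRYPTO_SECRETKEYBYTES);
    uint8_t ct[PQCLEAN_MCELIECE6960119_SSE_CRYPTO_CIPHERTEXTBYTES];
    uint8_t key_enc[PQCLEAN_MCELIECE6960119_SSE_CRYPTO_BYTES];
    uint8_t key_dec[PQCLEAN_MCELIECE6960119_SSE_CRYPTO_BYTES];
    int ok = -1;

    if (pk != NULL && sk != NULL) {
        PQCLEAN_MCELIECE6960119_SSE_crypto_kem_keypair(pk, sk);
        PQCLEAN_MCELIECE6960119_SSE_crypto_kem_enc(ct, key_enc, pk);
        PQCLEAN_MCELIECE6960119_SSE_crypto_kem_dec(key_dec, ct, sk);
        ok = memcmp(key_enc, key_dec, sizeof(key_enc));  /* 0 on success */
    }

    free(pk);
    free(sk);
    return ok;
}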
PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff 
= PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119_SSE_load8(ptr), PQCLEAN_MCELIECE6960119_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119_SSE_load8(ptr), PQCLEAN_MCELIECE6960119_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void 
PQCLEAN_MCELIECE6960119_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6960119/sse/benes.h b/crypto_kem/mceliece6960119/sse/benes.h new file mode 100644 index 00000000..6b5aa09f --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_BENES_H +#define PQCLEAN_MCELIECE6960119_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6960119_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/bm.c b/crypto_kem/mceliece6960119/sse/bm.c new file mode 100644 index 00000000..4b64ab8d --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/bm.c @@ -0,0 +1,204 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE6960119_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6960119_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE6960119_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE6960119_SSE_vec128_or(PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[idx0], mask[0]), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6960119_SSE_vec128_or(PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0, one << 63); + B[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0, one << 62); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_setzero(); + } + + b = 1; + L = 
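/*
 * Illustrative sketch: the constant-time selection idiom that
 * mask_nonzero()/mask_leq() above feed into.  A mask is either 0x0000 or
 * 0xFFFF, so (x & mask) | (y & ~mask) picks x or y without a data-dependent
 * branch; vec128_cmov() applies the same idea to whole 128-bit lanes inside
 * the surrounding Berlekamp-Massey routine.  ct_select_sketch is a local
 * name for illustration only.
 */
#include <assert.h>
#include <stdint.h>

static uint16_t ct_select_sketch(uint16_t x, uint16_t y, uint16_t mask) {
    /* mask is assumed to be all-ones (select x) or all-zeros (select y) */
    return (uint16_t)((x & mask) | (y & (uint16_t)~mask));
}

int main(void) {
    assert(ct_select_sketch(0x1234, 0xABCD, 0xFFFF) == 0x1234);
    assert(ct_select_sketch(0x1234, 0xABCD, 0x0000) == 0xABCD);
    return 0;
}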
0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119_SSE_update_asm(interval, coefs[N]); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(prod, C, (vec128 *) interval); + d = PQCLEAN_MCELIECE6960119_SSE_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE6960119_SSE_update_asm(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(C[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(C[i], 1); + + out[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/crypto_kem/mceliece6960119/sse/bm.h b/crypto_kem/mceliece6960119/sse/bm.h new file mode 100644 index 00000000..bed133a5 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_BM_H +#define PQCLEAN_MCELIECE6960119_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/consts.S b/crypto_kem/mceliece6960119/sse/consts.S new file mode 100644 index 00000000..bbf6767c --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE6960119_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE6960119_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE6960119_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 
0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6960119/sse/consts.inc b/crypto_kem/mceliece6960119/sse/consts.inc new file mode 100644 index 00000000..a839b8a6 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAA55555555AAAA, 
0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA5555AA55AAAA55, 
0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55AA55AA55A, 
0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00000000FFFF, 
0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece6960119/sse/controlbits.c b/crypto_kem/mceliece6960119/sse/controlbits.c new file mode 100644 index 00000000..8b47ffd2 --- /dev/null +++ 
b/crypto_kem/mceliece6960119/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include <stdint.h> + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/sse/controlbits.h b/crypto_kem/mceliece6960119/sse/controlbits.h new file mode 100644 index 00000000..1217b3d4 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/crypto_hash.h b/crypto_kem/mceliece6960119/sse/crypto_hash.h new file mode 100644 index 00000000..d8b9eb7d --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119/sse/decrypt.c b/crypto_kem/mceliece6960119/sse/decrypt.c new file mode 100644 index 00000000..58ed88dd --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6960119_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += 
_mm_popcnt_u64(PQCLEAN_MCELIECE6960119_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u32( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_or(PQCLEAN_MCELIECE6960119_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE6960119_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE6960119_SSE_vec128_or(diff, PQCLEAN_MCELIECE6960119_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6960119_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 64 ][ GFBITS ]; + vec128 scaled[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + + vec128 error[ 64 ]; + + vec128 s_priv[ 2 ][ GFBITS ]; + vec128 s_priv_cmp[ 2 ][ GFBITS ]; + + vec128 locator[ GFBITS ]; + + vec128 recv[ 64 ]; + vec128 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6960119_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119_SSE_benes(recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6960119_SSE_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119_SSE_vec128_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6960119_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6960119_SSE_benes(error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119/sse/decrypt.h b/crypto_kem/mceliece6960119/sse/decrypt.h new file mode 100644 index 00000000..78a7d2b2 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE6960119_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/encrypt.c b/crypto_kem/mceliece6960119/sse/encrypt.c new file mode 100644 index 00000000..6d924145 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/encrypt.c @@ -0,0 +1,105 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6960119_SSE_syndrome_asm(unsigned char *s, 
const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119/sse/encrypt.h b/crypto_kem/mceliece6960119/sse/encrypt.h new file mode 100644 index 00000000..a1122b31 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/fft.c b/crypto_kem/mceliece6960119/sse/fft.c new file mode 100644 index 00000000..428d7eeb --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/fft.c @@ -0,0 +1,231 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + 
PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE6960119_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = 
PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = 
PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119/sse/fft.h b/crypto_kem/mceliece6960119/sse/fft.h new file mode 100644 index 00000000..79ff5101 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_FFT_H +#define PQCLEAN_MCELIECE6960119_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE6960119_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/fft_tr.c b/crypto_kem/mceliece6960119/sse/fft_tr.c new file mode 100644 index 00000000..5e185806 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < 
GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE6960119_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] 
= PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6960119_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = 
PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = 
PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +/* justifying the length of the output */ +static void postprocess(vec128 out[][GFBITS]) { + int i; + uint64_t v[2]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(out[1][i], 0); + v[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(out[1][i], 1); + + v[1] <<= (128 - SYS_T) * 2; + v[1] >>= (128 - SYS_T) * 2; + + out[1][i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119/sse/fft_tr.h b/crypto_kem/mceliece6960119/sse/fft_tr.h new file mode 100644 index 00000000..44ced9d7 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE6960119_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119_SSE_fft_tr(vec128 
/*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/gf.c b/crypto_kem/mceliece6960119/sse/gf.c new file mode 100644 index 00000000..61bdcdb8 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119_SSE_gf_frac(in, ((gf) 1)); +} + +/* 
multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119_SSE_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119_SSE_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/sse/gf.h b/crypto_kem/mceliece6960119/sse/gf.h new file mode 100644 index 00000000..f7eeba37 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_GF_H +#define PQCLEAN_MCELIECE6960119_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119_SSE_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119_SSE_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119_SSE_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6960119_SSE_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6960119_SSE_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119_SSE_gf_inv(gf in); + +void PQCLEAN_MCELIECE6960119_SSE_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/operations.c b/crypto_kem/mceliece6960119/sse/operations.c new file mode 100644 index 00000000..955f9fa3 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // 
element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/sse/params.h b/crypto_kem/mceliece6960119/sse/params.h new file mode 100644 index 00000000..8c0b8674 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_PARAMS_H +#define PQCLEAN_MCELIECE6960119_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/pk_gen.c b/crypto_kem/mceliece6960119/sse/pk_gen.c new file mode 100644 index 00000000..a211efbc --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/pk_gen.c @@ -0,0 +1,275 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define 
NBLOCKS2_I ((GFBITS * SYS_T + 127) / 128) +int PQCLEAN_MCELIECE6960119_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE6960119_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } +
} + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119_SSE_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119_SSE_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/sse/pk_gen.h b/crypto_kem/mceliece6960119/sse/pk_gen.h new file mode 100644 index 00000000..3bdbaf0b --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE6960119_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/scalars_2x.inc b/crypto_kem/mceliece6960119/sse/scalars_2x.inc new file mode 100644 index 00000000..d67fe251 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 
0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 
0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6960119/sse/scalars_4x.inc b/crypto_kem/mceliece6960119/sse/scalars_4x.inc new file mode 100644 index 00000000..19902479 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF0000F0FF000F0, 
0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000000000000FFFF, 
0X000000FFFF000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece6960119/sse/sk_gen.c 
b/crypto_kem/mceliece6960119/sse/sk_gen.c new file mode 100644 index 00000000..96dc35e3 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // Gaussian elimination + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119_SSE_gf_mul(mat[ c ][ j ], inv); + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/sse/sk_gen.h b/crypto_kem/mceliece6960119/sse/sk_gen.h new file mode 100644 index 00000000..134954c0 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE6960119_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6960119_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/sse/syndrome_asm.S b/crypto_kem/mceliece6960119/sse/syndrome_asm.S new file mode 100644 index 00000000..c764108c --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/syndrome_asm.S @@ -0,0 +1,1311 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64
i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_SSE_syndrome_asm +.global PQCLEAN_MCELIECE6960119_SSE_syndrome_asm +_PQCLEAN_MCELIECE6960119_SSE_syndrome_asm: +PQCLEAN_MCELIECE6960119_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea (addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#3 +# asm 2: movdqu 16(ee=%xmm2 +movdqu 16(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#3 +# asm 2: movdqu 32(ee=%xmm2 +movdqu 32(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#3 +# asm 2: movdqu 48(ee=%xmm2 +movdqu 48(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#3 +# asm 2: movdqu 64(ee=%xmm2 +movdqu 64(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#3 +# asm 2: movdqu 80(ee=%xmm2 +movdqu 80(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#3 +# asm 2: movdqu 96(ee=%xmm2 +movdqu 96(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#3 +# asm 2: movdqu 112(ee=%xmm2 +movdqu 112(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#3 +# asm 2: movdqu 128(ee=%xmm2 +movdqu 128(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 
+movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#3 +# asm 2: movdqu 144(ee=%xmm2 +movdqu 144(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#3 +# asm 2: movdqu 160(ee=%xmm2 +movdqu 160(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#3 +# asm 2: movdqu 176(ee=%xmm2 +movdqu 176(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#3 +# asm 2: movdqu 192(ee=%xmm2 +movdqu 192(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#3 +# asm 2: movdqu 208(ee=%xmm2 +movdqu 208(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 
2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 416(pp=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 432(pp=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(ee=reg128#3 +# asm 2: movdqu 432(ee=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 448(pp=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(ee=reg128#3 +# asm 2: movdqu 448(ee=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 464(pp=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(ee=reg128#3 +# asm 2: movdqu 464(ee=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 480(pp=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(ee=reg128#3 +# asm 2: movdqu 480(ee=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 496(pp=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(ee=reg128#3 +# asm 2: movdqu 496(ee=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 512(pp=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(ee=reg128#3 +# asm 2: movdqu 512(ee=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 528(pp=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(ee=reg128#3 +# asm 2: movdqu 528(ee=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 544(pp=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(ee=reg128#3 +# asm 2: movdqu 544(ee=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 560(pp=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(ee=reg128#3 +# asm 2: movdqu 560(ee=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 576(pp=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(ee=reg128#3 +# asm 2: movdqu 576(ee=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 592(pp=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 592 ] +# asm 1: movdqu 592(ee=reg128#3 +# asm 2: movdqu 592(ee=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 608(pp=%xmm1 +movdqu 608(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(ee=reg128#3 +# asm 2: movdqu 608(ee=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 624(pp=%xmm1 +movdqu 624(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(ee=reg128#3 +# asm 2: movdqu 624(ee=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: 
movdqu 640(pp=%xmm1 +movdqu 640(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(ee=reg128#3 +# asm 2: movdqu 640(ee=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 656(pp=%xmm1 +movdqu 656(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(ee=reg128#3 +# asm 2: movdqu 656(ee=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov $676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: +._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= 
mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq 
$32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand 
%xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 
+movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & 
mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor 
x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: 
vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 
2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 
= v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = 
x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: 
vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: 
vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + 
+# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 
& mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# 
asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ 
input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand 
v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq 
$32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 
x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ 
input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand 
%xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: 
vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 
128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: 
v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# 
asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 
= x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# 
asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 
2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & 
mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# 
qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# 
qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 
+vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor 
%xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 
2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t 
v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119_SSE_vec128_set2x( PQCLEAN_MCELIECE6960119_SSE_load8(in), PQCLEAN_MCELIECE6960119_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119_SSE_store8(out + 0, PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119_SSE_store8(out + 8, PQCLEAN_MCELIECE6960119_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6960119/sse/util.h b/crypto_kem/mceliece6960119/sse/util.h new file mode 100644 index 00000000..5ff907b7 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_UTIL_H +#define PQCLEAN_MCELIECE6960119_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6960119/sse/vec128.c b/crypto_kem/mceliece6960119/sse/vec128.c new file mode 100644 index 00000000..4e720b2b --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE6960119_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + 
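/*
 * Editor's sketch (not part of the patch): the util.c helpers above fix a
 * little-endian serialization convention -- store8() writes the least
 * significant byte first and load8() reassembles the bytes in the same
 * order, so the two are inverses.  The self-contained snippet below mirrors
 * that logic under hypothetical names (le_store8 / le_load8) purely to make
 * the convention explicit; it does not call the PQCLEAN_* symbols.
 */
#include <assert.h>
#include <stdint.h>

static void le_store8(unsigned char *out, uint64_t in) {
    for (int i = 0; i < 8; i++) {
        out[i] = (unsigned char)(in >> (8 * i)); /* least significant byte first */
    }
}

static uint64_t le_load8(const unsigned char *in) {
    uint64_t ret = in[7];
    for (int i = 6; i >= 0; i--) {
        ret = (ret << 8) | in[i]; /* rebuild from most to least significant byte */
    }
    return ret;
}

int main(void) {
    unsigned char buf[8];
    le_store8(buf, UINT64_C(0x0123456789abcdef));
    assert(buf[0] == 0xef && buf[7] == 0x01);                /* little-endian byte order */
    assert(le_load8(buf) == UINT64_C(0x0123456789abcdef));   /* round trip */
    return 0;
}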
+vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6960119_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE6960119_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6960119_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6960119_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + 
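    /*
     * Editor's note: this chain is a Fermat inversion in GF(2^13) -- it raises
     * the input to 2^13 - 2 = 0b1111111111110.  The "^11", "^1111", ...
     * annotations give the running exponent in binary: each vec128_sq doubles
     * the exponent (appends a 0 bit) and each vec128_mul by a saved power
     * (tmp_11 = in^3, tmp_1111 = in^15) fills in the missing 1 bits.  The
     * final lone squaring contributes the trailing 0, so every nonzero input
     * comes out as its multiplicative inverse.
     */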
PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6960119_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6960119/sse/vec128.h b/crypto_kem/mceliece6960119/sse/vec128.h new file mode 100644 index 00000000..57de2cc6 --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE6960119_SSE_VEC128_H +#define PQCLEAN_MCELIECE6960119_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE6960119_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE6960119_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE6960119_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece6960119/sse/vec128_mul_asm.S b/crypto_kem/mceliece6960119/sse/vec128_mul_asm.S new file mode 100644 index 00000000..89f2276d --- /dev/null +++ b/crypto_kem/mceliece6960119/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: 
reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE6960119_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 
16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 
+vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 
112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 
160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 
= mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor 
r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 
88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119/vec/api.h b/crypto_kem/mceliece6960119/vec/api.h new file mode 100644 index 00000000..6d4191d1 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_API_H +#define PQCLEAN_MCELIECE6960119_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_ALGNAME "Classic McEliece 6960119" +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/benes.c b/crypto_kem/mceliece6960119/vec/benes.c new file mode 100644 index 00000000..8425458a --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + 
s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece6960119/vec/benes.h b/crypto_kem/mceliece6960119/vec/benes.h new file mode 100644 index 00000000..84b81a31 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_BENES_H +#define PQCLEAN_MCELIECE6960119_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE6960119_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/bm.c b/crypto_kem/mceliece6960119/vec/bm.c new file mode 100644 index 00000000..e6d0b79f --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/bm.c @@ -0,0 +1,239 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see 
https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= 
tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +//void bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) +void PQCLEAN_MCELIECE6960119_VEC_bm(vec out[][GFBITS], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = one << 63; + B[0][0] = 0; + B[1][0] = one << 62; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + update(interval, coefs[N]); + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[1], C[1], interval[1]); + + d = vec_reduce(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE6960119_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE6960119_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + out[0][i] = (C[0][i] >> 8) | (C[1][i] << 56); + out[1][i] = C[1][i] >> 8; + } +} + diff --git a/crypto_kem/mceliece6960119/vec/bm.h b/crypto_kem/mceliece6960119/vec/bm.h new file mode 100644 index 00000000..b9e7f523 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_BM_H +#define PQCLEAN_MCELIECE6960119_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6960119_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/consts.inc b/crypto_kem/mceliece6960119/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A 
+}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, 
+ 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 
0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 
0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 
0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 
0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 
0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece6960119/vec/controlbits.c b/crypto_kem/mceliece6960119/vec/controlbits.c new file mode 100644 index 00000000..f2ee80b7 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static 
void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE6960119_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/vec/controlbits.h b/crypto_kem/mceliece6960119/vec/controlbits.h new file mode 100644 index 00000000..77c92a08 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/crypto_hash.h b/crypto_kem/mceliece6960119/vec/crypto_hash.h new file mode 100644 index 00000000..323f1e25 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119/vec/decrypt.c b/crypto_kem/mceliece6960119/vec/decrypt.c new file mode 100644 index 00000000..d5e9672c --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/decrypt.c @@ -0,0 +1,193 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6960119_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6960119_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE6960119_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6960119_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; 
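/*
 * A minimal standalone sketch of the branchless compare-and-swap used by the
 * sorting network in controlbits.c above: the borrow of (y - x) becomes a 0/1
 * flag, which cswap() expands into a full mask. It only works for values below
 * 2^31, which is why the library keeps a separate 63-bit variant for the wider
 * sort keys. Throwaway demo with made-up inputs, not part of the library sources.
 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t x = 7, y = 3;

    uint32_t swap = (y - x) >> 31;        /* 1 iff y < x (inputs below 2^31)   */
    uint32_t mask = 0 - swap;             /* all-ones when swapping, else zero */
    uint32_t d = (x ^ y) & mask;

    x ^= d;                               /* x becomes the minimum             */
    y ^= d;                               /* y becomes the maximum             */

    printf("min = %u, max = %u\n", x, y); /* prints: min = 3, max = 7          */
    return 0;
}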
i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static uint16_t weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6960119_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6960119_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6960119_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE6960119_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6960119_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6960119_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119/vec/decrypt.h b/crypto_kem/mceliece6960119/vec/decrypt.h new file mode 100644 index 00000000..699cc4ea --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE6960119_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/encrypt.c b/crypto_kem/mceliece6960119/vec/encrypt.c new file mode 100644 index 00000000..b1910acf --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/encrypt.c @@ -0,0 +1,152 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t 
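/*
 * Sketch of the branch-free equality test used by weight_check() in decrypt.c
 * above: (w0 ^ t) | (w1 ^ t) is zero exactly when both weights equal t, and the
 * subtract-then-shift turns "zero" into 1 and "nonzero" into 0. This assumes
 * the weights stay below 2^15, which holds for the caller (they are at most
 * 2^GFBITS and SYS_N). Throwaway demo, not part of the library sources.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t both_equal(uint16_t w0, uint16_t w1, uint16_t t) {
    uint16_t check = (uint16_t)((w0 ^ t) | (w1 ^ t)); /* 0 only if w0 == t and w1 == t */
    check -= 1;                                       /* wraps to 0xFFFF only when 0   */
    check >>= 15;                                     /* top bit: 1 on success         */
    return check;
}

int main(void) {
    /* prints: 1 0 (second call fails because one weight is off by one) */
    printf("%u %u\n", both_equal(119, 119, 119), both_equal(119, 118, 119));
    return 0;
}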
one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char e_tmp[ SYS_N / 8 ]; + + uint64_t b; + + const uint8_t *pk_ptr8; + const uint8_t *e_ptr8 = e_tmp + SYND_BYTES - 1; + + int i, j, k, tail = (PK_NROWS % 8); + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + s[i - 1] &= (1 << tail) - 1; + + for (i = SYND_BYTES - 1; i < SYS_N / 8 - 1; i++) { + e_tmp[i] = (e[i] >> tail) | (e[i + 1] << (8 - tail)); + } + + e_tmp[i] = e[i] >> tail; + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE6960119_VEC_load8(pk_ptr8 + j * 8) & PQCLEAN_MCELIECE6960119_VEC_load8(e_ptr8 + j * 8); + } + + for (k = 0; k < (PK_NCOLS % 64 + 7) / 8; k++) { + b ^= pk_ptr8[8 * j + k] & e_ptr8[8 * j + k]; + } + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119/vec/encrypt.h b/crypto_kem/mceliece6960119/vec/encrypt.h new file mode 100644 index 00000000..17de3acd --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/fft.c b/crypto_kem/mceliece6960119/vec/fft.c new file mode 100644 index 00000000..85229c89 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/fft.c @@ -0,0 +1,269 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + 
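/*
 * Sketch of the branch-free bit placement in gen_e() from encrypt.c above: for
 * every 64-bit word of the error vector, a mask is derived that is all-ones
 * exactly when that word should receive the current error bit, so no
 * secret-dependent branch or memory index is used. The error position below is
 * made up purely for illustration; not part of the library sources.
 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t pos = 130;                        /* hypothetical error position      */
    uint64_t val = (uint64_t)1 << (pos & 63);  /* bit within its 64-bit word       */

    for (uint64_t i = 0; i < 4; i++) {         /* pretend the vector has 4 words   */
        uint64_t mask = i ^ (pos >> 6);        /* 0 only for the matching word     */
        mask -= 1;                             /* all-ones only when it was 0      */
        mask >>= 63;                           /* 1 iff this is the matching word  */
        mask = -mask;                          /* expand to 0 or all-ones          */

        printf("word %d gets %016llx\n", (int)i, (unsigned long long)(val & mask));
    }
    return 0;                                  /* only word 2 receives a nonzero value */
}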
{0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ 
pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + +// for (i = 0; i < 128; i++) +// for (b = 0; b < GFBITS; b++) +// 
out[i][b] ^= powers[i][b]; +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119/vec/fft.h b/crypto_kem/mceliece6960119/vec/fft.h new file mode 100644 index 00000000..867c9ed8 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_FFT_H +#define PQCLEAN_MCELIECE6960119_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6960119_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/fft_tr.c b/crypto_kem/mceliece6960119/vec/fft_tr.c new file mode 100644 index 00000000..e609c32e --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/fft_tr.c @@ -0,0 +1,300 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE6960119_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 
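/*
 * The reversal[] tables in fft.c and fft_tr.c above appear to be the 7-bit
 * bit-reversal permutation of 0..127 (e.g. 1 -> 64, 3 -> 96, 100 -> 19); this
 * throwaway check recomputes a few entries to make the pattern explicit. Not
 * part of the library sources.
 */
#include <stdio.h>

static unsigned rev7(unsigned x) {
    unsigned r = 0;
    for (int i = 0; i < 7; i++) {          /* mirror the 7 low-order bits */
        r = (r << 1) | ((x >> i) & 1);
    }
    return r;
}

int main(void) {
    printf("%u %u %u\n", rev7(1), rev7(3), rev7(100)); /* prints: 64 96 19 */
    return 0;
}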
51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + 
pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +/* justifying the length of the output */ +static void postprocess(vec out[4][GFBITS]) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[3][i] <<= (128 - SYS_T) * 2; + out[3][i] >>= (128 - SYS_T) * 2; + } +} + +void PQCLEAN_MCELIECE6960119_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119/vec/fft_tr.h b/crypto_kem/mceliece6960119/vec/fft_tr.h new file mode 100644 index 00000000..64e1d4bc --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE6960119_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6960119_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/gf.c b/crypto_kem/mceliece6960119/vec/gf.c new file mode 100644 index 00000000..b3215137 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const 
uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119_VEC_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119_VEC_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119_VEC_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119/vec/gf.h b/crypto_kem/mceliece6960119/vec/gf.h new file mode 100644 index 00000000..b81f13e0 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/gf.h @@ -0,0 +1,20 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_GF_H +#define PQCLEAN_MCELIECE6960119_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6960119_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE6960119_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf 
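/*
 * A plain, unoptimized reference for the field multiplication implemented by
 * PQCLEAN_MCELIECE6960119_VEC_gf_mul() in gf.c above. The reduction shifts
 * (>> 9, >> 10, >> 12, >> 13) correspond to the GF(2^13) polynomial
 * x^13 + x^4 + x^3 + x + 1, which this sketch spells out bit by bit. It is not
 * constant time (it branches on the operands) and exists only to make the
 * reduction readable; not part of the library sources.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;
    int i;

    for (i = 0; i < 13; i++) {                    /* carry-less schoolbook multiply */
        if ((b >> i) & 1) {
            acc ^= (uint32_t)a << i;
        }
    }

    for (i = 24; i >= 13; i--) {                  /* reduce x^i via x^13 = x^4 + x^3 + x + 1 */
        if ((acc >> i) & 1) {
            acc ^= ((uint32_t)1 << i)             /* drop x^i            */
                 ^ ((uint32_t)1 << (i - 9))       /* add x^(i-13) * x^4  */
                 ^ ((uint32_t)1 << (i - 10))      /* add x^(i-13) * x^3  */
                 ^ ((uint32_t)1 << (i - 12))      /* add x^(i-13) * x    */
                 ^ ((uint32_t)1 << (i - 13));     /* add x^(i-13) * 1    */
        }
    }

    return (uint16_t)(acc & 0x1FFF);
}

int main(void) {
    /* x * x^12 = x^13 = x^4 + x^3 + x + 1 = 27 */
    printf("%u\n", gf13_mul_ref(2, 0x1000));
    return 0;
}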
PQCLEAN_MCELIECE6960119_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6960119_VEC_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/operations.c b/crypto_kem/mceliece6960119/vec/operations.c new file mode 100644 index 00000000..4208bfb2 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/vec/params.h b/crypto_kem/mceliece6960119/vec/params.h new file mode 100644 index 00000000..500131b3 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/params.h @@ -0,0 +1,21 @@ +#ifndef 
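/*
 * A hedged usage sketch for the KEM defined by operations.c above. It assumes
 * the usual PQClean api.h size macros
 * (PQCLEAN_MCELIECE6960119_VEC_CRYPTO_{PUBLICKEY,SECRETKEY,CIPHERTEXT,BYTES}BYTES),
 * which are not reproduced in this part of the diff; the public key is large,
 * so it is heap-allocated. Illustration only, not part of the library sources.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "api.h"

int main(void) {
    uint8_t *pk = malloc(PQCLEAN_MCELIECE6960119_VEC_CRYPTO_PUBLICKEYBYTES);
    uint8_t *sk = malloc(PQCLEAN_MCELIECE6960119_VEC_CRYPTO_SECRETKEYBYTES);
    uint8_t ct[PQCLEAN_MCELIECE6960119_VEC_CRYPTO_CIPHERTEXTBYTES];
    uint8_t ss_enc[PQCLEAN_MCELIECE6960119_VEC_CRYPTO_BYTES];
    uint8_t ss_dec[PQCLEAN_MCELIECE6960119_VEC_CRYPTO_BYTES];

    if (pk == NULL || sk == NULL) {
        free(pk);
        free(sk);
        return 1;
    }

    PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair(pk, sk);      /* generate (pk, sk)    */
    PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc(ct, ss_enc, pk);  /* encapsulate under pk */
    PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec(ss_dec, ct, sk);  /* decapsulate with sk  */

    printf("shared secrets %s\n",
           memcmp(ss_enc, ss_dec, sizeof(ss_enc)) == 0 ? "match" : "differ");

    free(pk);
    free(sk);
    return 0;
}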
PQCLEAN_MCELIECE6960119_VEC_PARAMS_H +#define PQCLEAN_MCELIECE6960119_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/pk_gen.c b/crypto_kem/mceliece6960119/vec/pk_gen.c new file mode 100644 index 00000000..e8f639a2 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/pk_gen.c @@ -0,0 +1,251 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +int PQCLEAN_MCELIECE6960119_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + const int block_idx = NBLOCKS_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE6960119_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j 
][ k ]; + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE6960119_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119_VEC_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119_VEC_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/vec/pk_gen.h b/crypto_kem/mceliece6960119/vec/pk_gen.h new file mode 100644 index 00000000..53607165 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE6960119_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119_VEC_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/scalars_2x.inc b/crypto_kem/mceliece6960119/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 
0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece6960119/vec/scalars_4x.inc b/crypto_kem/mceliece6960119/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 
0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece6960119/vec/sk_gen.c b/crypto_kem/mceliece6960119/vec/sk_gen.c new file mode 100644 index 00000000..fe0696cf --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, 
inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119/vec/sk_gen.h b/crypto_kem/mceliece6960119/vec/sk_gen.h new file mode 100644 index 00000000..557e3343 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE6960119_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/transpose.c b/crypto_kem/mceliece6960119/vec/transpose.c new file mode 100644 index 00000000..bea35a5f --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece6960119/vec/transpose.h b/crypto_kem/mceliece6960119/vec/transpose.h new file mode 100644 index 00000000..9d7079c6 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE6960119_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + 
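+/* note: in and out are 64x64 bit matrices stored as 64 uint64_t rows; out receives the transpose of in (see transpose.c) */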
+ +void PQCLEAN_MCELIECE6960119_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/util.c b/crypto_kem/mceliece6960119/vec/util.c new file mode 100644 index 00000000..41b277a0 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6960119_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE6960119_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece6960119/vec/util.h b/crypto_kem/mceliece6960119/vec/util.h new file mode 100644 index 00000000..09f5a21a --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/util.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_UTIL_H +#define PQCLEAN_MCELIECE6960119_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6960119_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE6960119_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE6960119_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE6960119_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE6960119_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE6960119_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE6960119_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece6960119/vec/vec.c b/crypto_kem/mceliece6960119/vec/vec.c new file mode 100644 index 00000000..514d2173 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/vec.c @@ -0,0 +1,138 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE6960119_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; 
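+    // replicate the 16-bit value into all four 16-bit lanes of the 64-bit word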
+ ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE6960119_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE6960119_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE6960119_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return (int)(a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE6960119_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6960119_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6960119_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6960119_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6960119_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6960119/vec/vec.h b/crypto_kem/mceliece6960119/vec/vec.h new file mode 100644 index 00000000..882fce58 --- /dev/null +++ b/crypto_kem/mceliece6960119/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE6960119_VEC_VEC_H +#define PQCLEAN_MCELIECE6960119_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE6960119_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE6960119_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE6960119_VEC_vec_copy(vec *out, const vec *in); + +vec 
PQCLEAN_MCELIECE6960119_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE6960119_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE6960119_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE6960119_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE6960119_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/META.yml b/crypto_kem/mceliece6960119f/META.yml new file mode 100644 index 00000000..07842a87 --- /dev/null +++ b/crypto_kem/mceliece6960119f/META.yml @@ -0,0 +1,50 @@ +name: Classic McEliece 6960119f +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1047319 +length-secret-key: 13908 +length-ciphertext: 226 +length-shared-secret: 32 +nistkat-sha256: 653ada51f795f7c606a6316f6c6db50f18804fe4a07aa26c78dc8f4ae2f9bccd +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - bmi1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi1 + - popcnt diff --git a/crypto_kem/mceliece6960119f/avx/LICENSE b/crypto_kem/mceliece6960119f/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6960119f/avx/Makefile b/crypto_kem/mceliece6960119f/avx/Makefile new file mode 100644 index 00000000..ac86ca20 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6960119f_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6960119f/avx/aes256ctr.c b/crypto_kem/mceliece6960119f/avx/aes256ctr.c new file mode 100644 index 00000000..aba3b74d --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6960119F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6960119f/avx/aes256ctr.h b/crypto_kem/mceliece6960119f/avx/aes256ctr.h new file mode 100644 index 00000000..02e0ed00 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE6960119F_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/api.h b/crypto_kem/mceliece6960119f/avx/api.h new file mode 100644 index 00000000..3ea68e54 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_API_H +#define PQCLEAN_MCELIECE6960119F_AVX_API_H + +#include + +#define 
PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_ALGNAME "Classic McEliece 6960119f" +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/benes.c b/crypto_kem/mceliece6960119f/avx/benes.c new file mode 100644 index 00000000..a1c9e78c --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = 
PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], 
bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119F_AVX_load8(ptr), PQCLEAN_MCELIECE6960119F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119F_AVX_load8(ptr), PQCLEAN_MCELIECE6960119F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; 
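+    // middle stage of the Benes network: the swap stride keeps growing through layer_4 and layer_5, then shrinks symmetrically back down to layer_0 and layer_x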
+ layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6960119f/avx/benes.h b/crypto_kem/mceliece6960119f/avx/benes.h new file mode 100644 index 00000000..5dd0156b --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_BENES_H +#define PQCLEAN_MCELIECE6960119F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6960119F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/bm.c b/crypto_kem/mceliece6960119f/avx/bm.c new file mode 100644 index 00000000..47cfa558 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/bm.c @@ -0,0 +1,210 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6960119F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6960119F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(PQCLEAN_MCELIECE6960119F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_setzero(); + } + + mask[0][0] 
= PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0, one << 62); + BC[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119F_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + + d = PQCLEAN_MCELIECE6960119F_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = 
PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE6960119F_AVX_update_asm(BC, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(BC[i][1], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(BC[i][1], 1); + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/crypto_kem/mceliece6960119f/avx/bm.h b/crypto_kem/mceliece6960119f/avx/bm.h new file mode 100644 index 00000000..6246a339 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_BM_H +#define PQCLEAN_MCELIECE6960119F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/consts.S b/crypto_kem/mceliece6960119f/avx/consts.S new file mode 100644 index 00000000..a3096c1f --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6960119F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 
0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6960119f/avx/consts.inc b/crypto_kem/mceliece6960119f/avx/consts.inc new file mode 100644 index 00000000..7f87f10e --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ 
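+    // as in the //64 and //128 groups above, each group holds GFBITS = 13 packed 256-bit constants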
+ PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece6960119f/avx/controlbits.c b/crypto_kem/mceliece6960119f/avx/controlbits.c new file mode 100644 index 00000000..1982a804 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort 
x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE6960119F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/avx/controlbits.h b/crypto_kem/mceliece6960119f/avx/controlbits.h new file mode 100644 index 00000000..ef3b57e5 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/crypto_hash.h b/crypto_kem/mceliece6960119f/avx/crypto_hash.h new file mode 100644 index 00000000..3676f10a --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/decrypt.c b/crypto_kem/mceliece6960119f/avx/decrypt.c new file mode 100644 index 00000000..493367f4 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/decrypt.c @@ -0,0 +1,236 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + 
+ for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(diff, PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6960119F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6960119F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6960119F_AVX_fft_tr(s_priv, scaled); + 
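+    /* Editor's note -- explanatory comment, not in the upstream sources: at this
+       point s_priv holds the 2*SYS_T private syndrome coordinates in bitsliced
+       form.  The bm() call below runs the Berlekamp(-Massey) step of the
+       "Berlekamp decoder" named above to obtain the error-locator polynomial,
+       and the following fft() evaluates that locator at every field element.
+       Positions where it evaluates to zero are the error positions, which is
+       why eval[i] is OR-reduced across its GFBITS bit-slices and then
+       complemented with allone to build error256 further down. */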
PQCLEAN_MCELIECE6960119F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6960119F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6960119F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119f/avx/decrypt.h b/crypto_kem/mceliece6960119f/avx/decrypt.h new file mode 100644 index 00000000..7aed1ae2 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6960119F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/encrypt.c b/crypto_kem/mceliece6960119f/avx/encrypt.c new file mode 100644 index 00000000..e830305a --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6960119F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119f/avx/encrypt.h 
b/crypto_kem/mceliece6960119f/avx/encrypt.h new file mode 100644 index 00000000..bacb1b37 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/fft.c b/crypto_kem/mceliece6960119f/avx/fft.c new file mode 100644 index 00000000..72d02d4b --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 
40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119f/avx/fft.h b/crypto_kem/mceliece6960119f/avx/fft.h new file mode 100644 index 00000000..21ef0e9a --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_FFT_H +#define PQCLEAN_MCELIECE6960119F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6960119F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/fft_tr.c b/crypto_kem/mceliece6960119f/avx/fft_tr.c new file mode 100644 index 00000000..44d48385 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/fft_tr.c @@ -0,0 +1,400 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFFFFFF, 
0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + 
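+    /* Editor's note -- explanatory comment, not in the upstream sources: the
+       unpack_low/high + vec256_ama_asm(..., consts[1]) step above and the
+       unpack_low_2x/high_2x + vec256_ama_asm(..., consts[0]) step below handle
+       the finest-grained butterflies, the ones that mix entries inside a single
+       256-bit vector; per the header comment of this file they are (roughly)
+       the transpose of the corresponding vec256_maa_asm butterflies in fft.c. */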
PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + 
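+            /* Editor's note -- explanatory comment, not in the upstream sources:
+               this long unrolled run is one accumulation pass over the 64
+               transposed words.  Each buf.V[j] = vec256_xor(buf.V[j], buf.V[k])
+               line folds an already-processed word into the next one, and
+               selected partial sums are collected into the six accumulators
+               pre.V[0..5]; after the transpose loop, each pre.v[i] is multiplied
+               by the bit pattern of beta[i] to finish the transposed broadcast
+               step. */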
buf.V[59] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 3); + + v[3] <<= (128 - SYS_T) * 2; + v[3] >>= (128 - SYS_T) * 2; + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + + +void PQCLEAN_MCELIECE6960119F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119f/avx/fft_tr.h b/crypto_kem/mceliece6960119f/avx/fft_tr.h new file mode 100644 index 00000000..464413bc --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6960119F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/gf.c b/crypto_kem/mceliece6960119f/avx/gf.c new file mode 100644 index 00000000..a9949c30 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp 
& 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119F_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/avx/gf.h b/crypto_kem/mceliece6960119f/avx/gf.h new file mode 100644 index 00000000..fc4686cf --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/gf.h @@ -0,0 +1,22 @@ +#ifndef 
PQCLEAN_MCELIECE6960119F_AVX_GF_H +#define PQCLEAN_MCELIECE6960119F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6960119F_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE6960119F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/int32_sort.c b/crypto_kem/mceliece6960119f/avx/int32_sort.c new file mode 100644 index 00000000..e8e6dd0b --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + 
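+            /* Editor's note -- explanatory comment, not in the upstream sources:
+               x0..x3 are loaded 32 elements apart, so MINMAX(x0, x2) and
+               MINMAX(x1, x3) above form the distance-64 stage, while
+               MINMAX(x0, x1) together with the next line forms the distance-32
+               stage of the bitonic merge, matching the "stages 64,32" note in
+               this function's header comment. */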
int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* 
A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = 
int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + 
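+            /* Editor's note -- explanatory comment, not in the upstream sources:
+               in this p == 2 branch (and the p == 4 and p == 1 branches around
+               it) the XOR mask complements selected lanes before the
+               compare-exchange; since two's-complement bitwise complement
+               reverses the signed ordering, a single vector MINMAX then performs
+               both the ascending and the descending compare-exchanges of this
+               merge distance at once.  The masks are compensated for by later
+               passes, as the "everything is still masked" comment further down
+               notes. */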
int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = 
_mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * 
q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = 
_mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6960119F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 
48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 
16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], 
&x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece6960119f/avx/int32_sort.h b/crypto_kem/mceliece6960119f/avx/int32_sort.h new file mode 100644 index 00000000..88326897 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6960119F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6960119F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/operations.c b/crypto_kem/mceliece6960119f/avx/operations.c new file mode 100644 index 00000000..141987d8 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); 
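/* Note (annotation, not part of the patch): the Goppa polynomial irr[] is
   serialized here coefficient by coefficient into the secret key, directly
   after its first SYS_N/8 bytes (see SK_BYTES in params.h).  pk_gen() below
   derives the public key from perm and this polynomial, and controlbits()
   fills the final COND_BYTES of sk; if genpoly_gen, perm_check or pk_gen
   reports failure, the whole attempt is discarded and the loop retries with
   fresh AES-CTR output, the seed having already been replaced by the last
   32 bytes of r. */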
+ } + if (PQCLEAN_MCELIECE6960119F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/avx/params.h b/crypto_kem/mceliece6960119f/avx/params.h new file mode 100644 index 00000000..b97615e4 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6960119F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/pk_gen.c b/crypto_kem/mceliece6960119f/avx/pk_gen.c new file mode 100644 index 00000000..4f8e1529 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/pk_gen.c @@ -0,0 +1,372 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) 
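/* Note (annotation, not part of the patch): each pass of this loop works on
   2s x 2s blocks of the 64x64 bit matrix, with s = 1 << d.  masks[d] selects
   alternating groups of s bits, and the shift/OR pair below swaps the two
   off-diagonal s x s sub-blocks of every block; after the six passes
   (s = 32 down to 1) the matrix held in out[0..63] is transposed in place. */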
{ + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row, tail; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + tail = row % 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) | + (mat[ row + i ][ block_idx + 1 ] << (64 - tail)); + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) | + (mat[ i + j ][ block_idx + 1 ] << (64 - tail)); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail)); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ NBLOCKS2_H * 4 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + 
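/* Note (annotation, not part of the patch): prod[i] accumulates the running
   product eval[0] * ... * eval[i].  Combined with the single vec256_inv()
   call and the downward sweep that follow, this is the standard
   simultaneous-inversion (Montgomery) trick: one field inversion yields the
   inverses of all 32 bitsliced evaluation blocks, i.e. the values 1/g(alpha)
   used to fill in the parity-check matrix below. */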
PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[k] = mat[ row ][k]; + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119F_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119F_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/avx/pk_gen.h b/crypto_kem/mceliece6960119f/avx/pk_gen.h new file mode 100644 index 00000000..ceddc05b --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6960119F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + 
+int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/scalars_2x.inc b/crypto_kem/mceliece6960119f/avx/scalars_2x.inc new file mode 100644 index 00000000..2f9b7474 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6960119f/avx/scalars_4x.inc b/crypto_kem/mceliece6960119f/avx/scalars_4x.inc new file mode 100644 index 00000000..b897c2c5 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece6960119f/avx/sk_gen.c b/crypto_kem/mceliece6960119f/avx/sk_gen.c new file mode 100644 index 00000000..21a02ac4 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; 
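/* Note (annotation, not part of the patch): on success out[0..SYS_T-1] hold
   coefficients c_i with f^SYS_T = c_0 + c_1*f + ... + c_{SYS_T-1}*f^(SYS_T-1),
   so the monic minimal polynomial handed back to the caller is
   g(x) = x^SYS_T + sum_i c_i x^i.  A -1 return above means the elimination
   could not find a nonzero pivot, i.e. the system was singular and the caller
   must sample a fresh f. */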
+} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/avx/sk_gen.h b/crypto_kem/mceliece6960119f/avx/sk_gen.h new file mode 100644 index 00000000..025f23c2 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6960119F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/syndrome_asm.S b/crypto_kem/mceliece6960119f/avx/syndrome_asm.S new file mode 100644 index 00000000..b5cabf8f --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/syndrome_asm.S @@ -0,0 +1,921 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64 i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm +_PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm: +PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea (addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# 
asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#3 +# asm 2: vmovupd 32(ee=%ymm2 +vmovupd 32(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#3 +# asm 2: vmovupd 64(ee=%ymm2 +vmovupd 64(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#3 +# asm 2: vmovupd 96(ee=%ymm2 +vmovupd 96(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 416 ] +# asm 1: vmovupd 416(ee=reg256#3 +# asm 2: vmovupd 416(ee=%ymm2 +vmovupd 416(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 448 ] +# asm 1: vmovupd 448(ee=reg256#3 +# asm 2: vmovupd 448(ee=%ymm2 +vmovupd 448(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 480 ] +# asm 1: vmovupd 480(ee=reg256#3 +# asm 
2: vmovupd 480(ee=%ymm2 +vmovupd 480(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 512 ] +# asm 1: vmovupd 512(ee=reg256#3 +# asm 2: vmovupd 512(ee=%ymm2 +vmovupd 512(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 544 ] +# asm 1: vmovupd 544(ee=reg256#3 +# asm 2: vmovupd 544(ee=%ymm2 +vmovupd 544(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 576 ] +# asm 1: vmovupd 576(ee=reg256#3 +# asm 2: vmovupd 576(ee=%ymm2 +vmovupd 576(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 608 ] +# asm 1: vmovupd 608(ee=reg256#3 +# asm 2: vmovupd 608(ee=%ymm2 +vmovupd 608(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 640 ] +# asm 1: vmovupd 640(ee=reg256#3 +# asm 2: vmovupd 640(ee=%ymm2 +vmovupd 640(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov $676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: 
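# Note (annotation, not part of the patch): in this qhasm-generated file the
# "# qhasm:" comments are the statements of the original qhasm source, and the
# "# asm 1:" / "# asm 2:" comments show the same statement before and after
# register allocation; the unprefixed lines are the emitted instructions.
# The main ._loop above effectively ANDs one 677-byte matrix row (addressed
# via input_1) with the error-vector bytes at input_2 and reduces the result
# to a single parity bit with the popcnt/xor chain, i.e. one GF(2) dot
# product per syndrome bit.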
+._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# 
qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: 
x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# 
asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor 
x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor 
%xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: 
movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 
= v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand 
v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 
2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor 
x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: 
x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = 
v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand 
v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# 
asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: 
vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# 
qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = 
v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: 
vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 
+movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 
+vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw 
$8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ 
input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor 
%xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# 
asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq 
$2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 
+movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + 
+# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: 
vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# 
asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & 
mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x 
v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand 
%xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor 
x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand 
v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: 
vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 
+ +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# 
qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# 
asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 
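
Note (not part of the patch): the qhasm listing is hard to review in diff form, so here is a hedged AVX2-intrinsics sketch of what one interleaving level of the 64x256 transpose above does. It mirrors the `v00 = x0 & mask0`, `4x v10 = x4 << 32`, `4x v01 = x0 unsigned>> 32`, `v11 = x4 & mask1` pattern; the constants stand in for the MASK5_0/MASK5_1 tables, which are assumed to be the repeating 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 patterns. Names are illustrative only.

    #include <immintrin.h>

    /* One interleaving level: exchange the 32-bit halves of each 64-bit
     * lane between a "low" row x0 and a "high" row x4. */
    static void interleave32(__m256i *x0, __m256i *x4) {
        const __m256i lo32 = _mm256_set1_epi64x(0x00000000FFFFFFFFLL);
        const __m256i hi32 = _mm256_set1_epi64x((long long) 0xFFFFFFFF00000000ULL);

        __m256i v00 = _mm256_and_si256(*x0, lo32);   /* v00 = x0 & mask0   */
        __m256i v10 = _mm256_slli_epi64(*x4, 32);    /* 4x v10 = x4 << 32  */
        __m256i v01 = _mm256_srli_epi64(*x0, 32);    /* 4x v01 = x0 >> 32  */
        __m256i v11 = _mm256_and_si256(*x4, hi32);   /* v11 = x4 & mask1   */

        *x0 = _mm256_or_si256(v00, v10);             /* x0 = v00 | v10     */
        *x4 = _mm256_or_si256(v01, v11);             /* x4 = v01 | v11     */
    }

The 16-bit (vpslld/vpsrld with MASK4_*) and 8-bit (vpsllw/vpsrlw with MASK3_*) levels that follow have the same shape, just with narrower masks and shift counts.
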
+vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor 
%ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 
= x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor 
%ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + 
+# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 
] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor 
%ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# 
qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 
+vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] 
= x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand 
%ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# 
qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 
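+# ----------------------------------------------------------------------------
+# Editor's annotation (not part of the qhasm-generated output): each block of
+# this routine loads eight 256-bit rows from input_0 at a 256-byte stride
+# (for the current block: offsets 160, 416, 672, ..., 1952), runs the three
+# exchange layers at 32-, 16- and 8-bit granularity, and stores the results
+# back in place.  Successive blocks advance the base offset by 32 bytes, so
+# the eight blocks together cover the full 2048-byte (64 x 256-bit) buffer.
+# ----------------------------------------------------------------------------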
+vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 
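+# ----------------------------------------------------------------------------
+# Editor's annotation (not part of the qhasm-generated output): from their use
+# in this routine, mask0..mask5 appear to be even/odd lane-selection pairs,
+# presumably loaded at the top of the function from the PQCLEAN_*_MASK*_{0,1}
+# constants as in the other transpose routines in this diff: mask0/mask1
+# select the low/high 32-bit half of every 64-bit lane (the vpsllq/vpsrlq $32
+# layer), mask2/mask3 the 16-bit halves of every 32-bit lane, and mask4/mask5
+# the byte halves of every 16-bit lane.
+# ----------------------------------------------------------------------------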
+vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 
= v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq 
$32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand 
%ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: 
vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld 
$16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor 
x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 
96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand 
%ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: 
vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = 
x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 
512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = 
v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# 
qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 
& mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = 
x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & 
mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 
1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: 
v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor 
%ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# 
qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor 
%ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# 
asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor 
%ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & 
mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6960119F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/update_asm.S b/crypto_kem/mceliece6960119f/avx/update_asm.S new file mode 100644 index 00000000..e57ae7fb --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_update_asm +.global PQCLEAN_MCELIECE6960119F_AVX_update_asm +_PQCLEAN_MCELIECE6960119F_AVX_update_asm: +PQCLEAN_MCELIECE6960119F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# 
qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 
0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x( PQCLEAN_MCELIECE6960119F_AVX_load8(in), PQCLEAN_MCELIECE6960119F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119F_AVX_store8(out + 0, PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119F_AVX_store8(out + 8, PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6960119f/avx/util.h b/crypto_kem/mceliece6960119f/avx/util.h new file mode 100644 index 00000000..c1001a62 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_UTIL_H +#define PQCLEAN_MCELIECE6960119F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/vec128.c b/crypto_kem/mceliece6960119f/avx/vec128.c new file mode 100644 index 00000000..d2c9cdf2 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i 
= 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece6960119f/avx/vec128.h b/crypto_kem/mceliece6960119f/avx/vec128.h new file mode 100644 index 00000000..765be179 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_VEC128_H +#define PQCLEAN_MCELIECE6960119F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece6960119f/avx/vec128_mul_asm.S b/crypto_kem/mceliece6960119f/avx/vec128_mul_asm.S new file mode 100644 index 00000000..8f8a0cb8 --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 
b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand 
r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand 
%ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: 
vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa 
%xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = 
h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 
+vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6960119F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void 
PQCLEAN_MCELIECE6960119F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece6960119f/avx/vec256_ama_asm.S b/crypto_kem/mceliece6960119f/avx/vec256_ama_asm.S new file mode 100644 index 00000000..14f698fd --- /dev/null +++ b/crypto_kem/mceliece6960119f/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 
192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# 
qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 
64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 
64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 
128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 
192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + 
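+# (note: each "r = a_i & b_j" / "r_{i+j} ^= r" pair above accumulates one
+#  partial product; the 13 bit-slices of each operand sit at 32-byte offsets
+#  0..384, so the block as a whole amounts to a schoolbook multiplication of
+#  two 13-coefficient bit-sliced polynomials)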
+# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 
32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 
] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + 
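+# (note: same AND/XOR pattern as the block above, but with the a_i slices
+#  loaded from input_1; a12 is multiplied against all 13 slices of input_2
+#  before the code moves on to a11, a10, ...)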
+# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 
256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 
] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 
176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor 
c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119F_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119f/clean/api.h b/crypto_kem/mceliece6960119f/clean/api.h new file mode 100644 index 00000000..056a0999 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_API_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_ALGNAME "Classic McEliece 6960119f" +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif diff --git a/crypto_kem/mceliece6960119f/clean/benes.c b/crypto_kem/mceliece6960119f/clean/benes.c new file mode 100644 index 00000000..4104f5f8 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119F_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + 
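+        /* at this point 64 x 8 = 512 bytes of condition bits (one layer of the
+           Benes network) have been read; for the inverse permutation (rev != 0)
+           bits_ptr started at the final layer and inc = -1024 steps it back to
+           the start of the previous layer, so layers are consumed in reverse order */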
bits_ptr += inc; + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6960119F_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE6960119F_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE6960119F_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE6960119F_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE6960119F_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/benes.h b/crypto_kem/mceliece6960119f/clean/benes.h new file mode 100644 index 00000000..47f35263 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_BENES_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119F_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE6960119F_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/bm.c b/crypto_kem/mceliece6960119f/clean/bm.c new file mode 100644 index 00000000..e5dfc03f --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE6960119F_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE6960119F_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/bm.h b/crypto_kem/mceliece6960119f/clean/bm.h new file mode 100644 index 00000000..b512a981 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_BM_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119F_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/controlbits.c b/crypto_kem/mceliece6960119f/clean/controlbits.c new file mode 100644 index 00000000..6c8f8408 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/controlbits.h b/crypto_kem/mceliece6960119f/clean/controlbits.h new file mode 100644 index 00000000..60294352 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/crypto_hash.h b/crypto_kem/mceliece6960119f/clean/crypto_hash.h new file mode 100644 index 00000000..fae1e006 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119f/clean/decrypt.c b/crypto_kem/mceliece6960119f/clean/decrypt.c new file mode 100644 index 00000000..8af678e8 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/decrypt.c @@ -0,0 +1,92 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" +#include + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119F_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE6960119F_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE6960119F_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE6960119F_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE6960119F_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE6960119F_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE6960119F_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece6960119f/clean/decrypt.h b/crypto_kem/mceliece6960119f/clean/decrypt.h new file mode 100644 index 00000000..b657194a --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119F_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/encrypt.c b/crypto_kem/mceliece6960119f/clean/encrypt.c new file mode 100644 index 00000000..cc50266a --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/encrypt.c @@ -0,0 +1,144 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" 
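+/* Note on the routines below: gen_e() draws 2*SYS_T candidate indices from
+   randombytes(), reduces them to GFBITS bits, and starts over whenever fewer
+   than SYS_T of them fall below SYS_N or any of the first SYS_T collide,
+   i.e. the support of the weight-SYS_T error vector is chosen by rejection
+   sampling.  syndrome() then computes H*e one row at a time, combining the
+   stored public-key row with that row's bit of the implicit identity block. */
+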
+#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + unsigned char mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ind_8[i + 1] << 8 | ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, ind[j] >> 3); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j, tail = PK_NROWS % 8; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + for (j = SYS_N / 8 - 1; j >= SYS_N / 8 - PK_ROW_BYTES; j--) { + row[ j ] = (row[ j ] << tail) | (row[j - 1] >> (8 - tail)); + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE6960119F_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119f/clean/encrypt.h b/crypto_kem/mceliece6960119f/clean/encrypt.h new file mode 100644 index 00000000..3d09068b --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119F_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/gf.c b/crypto_kem/mceliece6960119f/clean/gf.c new file mode 100644 index 00000000..6934be07 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/gf.c @@ -0,0 +1,209 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(gf in0, gf in1) 
{ + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119F_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE6960119F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 
2] ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(prod[i], (gf) 6400); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/gf.h b/crypto_kem/mceliece6960119f/clean/gf.h new file mode 100644 index 00000000..3ce57eff --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_GF_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119F_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE6960119F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/operations.c b/crypto_kem/mceliece6960119f/clean/operations.c new file mode 100644 index 00000000..fc646ea6 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119F_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119F_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119F_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; 
i++) { + f[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119F_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119F_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119F_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119F_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119F_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/clean/params.h b/crypto_kem/mceliece6960119f/clean/params.h new file mode 100644 index 00000000..c71bcd73 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/pk_gen.c b/crypto_kem/mceliece6960119f/clean/pk_gen.c new file mode 100644 index 00000000..e47482cf --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/pk_gen.c @@ -0,0 +1,326 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint8_t mat[][ SYS_N / 8 ], uint32_t *perm) { + int i, j, k, s, block_idx, row, tail; + uint64_t buf[64], ctz_list[32], t, d, mask; + unsigned char tmp[9]; + + row = GFBITS * SYS_T - 32; + block_idx = row / 8; + tail = row % 8; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + for (j = 0; j < 9; j++) { + tmp[j] = mat[ row + i ][ block_idx + j ]; + } + for (j = 0; j < 8; j++) { + tmp[j] = (tmp[j] >> tail) | (tmp[j + 1] << (8 - tail)); + } + + buf[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load8( tmp ); + } + + // compute the column indices of pivots by Gaussian elimination. 
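+    // (for each of the 32 rows, the OR of the current and all later rows
+    //  guarantees a non-zero argument for ctz(); its lowest set bit becomes
+    //  the pivot column, later rows are folded into the current one by
+    //  masked XORs until that bit is present, and the bit is then cleared
+    //  from every other row, without secret-dependent branches beyond the
+    //  rank-failure return)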
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + for (k = 0; k < 9; k++) { + tmp[k] = mat[ i + j ][ block_idx + k ]; + } + for (k = 0; k < 8; k++) { + tmp[k] = (tmp[k] >> tail) | (tmp[k + 1] << (8 - tail)); + } + + buf[j] = PQCLEAN_MCELIECE6960119F_CLEAN_load8( tmp ); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + PQCLEAN_MCELIECE6960119F_CLEAN_store8( tmp, buf[j] ); + + mat[ i + j ][ block_idx + 8 ] = (mat[ i + j ][ block_idx + 8 ] >> tail << tail) | (tmp[7] >> (8 - tail)); + mat[ i + j ][ block_idx + 0 ] = (tmp[0] << tail) | (mat[ i + j ][ block_idx ] << (8 - tail) >> (8 - tail)); + + for (k = 7; k >= 1; k--) { + mat[ i + j ][ block_idx + k ] = (tmp[k] << tail) | (tmp[k - 1] >> (8 - tail)); + } + } + } + + return 0; +} + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE6960119F_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + unsigned char *pk_ptr = pk; + + int i, j, k; + int row, c, tail; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE6960119F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE6960119F_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE6960119F_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE6960119F_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ 
j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + tail = (GFBITS * SYS_T) % 8; + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = (GFBITS * SYS_T - 1) / 8; j < SYS_N / 8 - 1; j++) { + *pk_ptr++ = (mat[i][j] >> tail) | (mat[i][j + 1] << (8 - tail)); + } + + *pk_ptr++ = (mat[i][j] >> tail); + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/clean/pk_gen.h b/crypto_kem/mceliece6960119f/clean/pk_gen.h new file mode 100644 index 00000000..aaa625f6 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE6960119F_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/root.c b/crypto_kem/mceliece6960119f/clean/root.c new file mode 100644 index 00000000..71764450 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE6960119F_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE6960119F_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE6960119F_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE6960119F_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/root.h b/crypto_kem/mceliece6960119f/clean/root.h new file mode 100644 index 00000000..a0b2aafc --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE6960119F_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE6960119F_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/sk_gen.c b/crypto_kem/mceliece6960119f/clean/sk_gen.c new file mode 100644 index 00000000..97788838 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" 
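+/* Note on genpoly_gen() below: mat[j] holds f^j in GF((2^m)^t), so the
+   Gauss-Jordan elimination effectively expresses f^SYS_T as a linear
+   combination of 1, f, ..., f^(SYS_T-1); the combination coefficients, read
+   out of mat[SYS_T], are the non-leading coefficients of the monic minimal
+   (Goppa) polynomial.  If a pivot cannot be made non-zero, the routine
+   returns -1. */
+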
+#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119F_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119F_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119F_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119F_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119F_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119F_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/clean/sk_gen.h b/crypto_kem/mceliece6960119f/clean/sk_gen.h new file mode 100644 index 00000000..940b314c --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119F_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119F_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/synd.c b/crypto_kem/mceliece6960119f/clean/synd.c new file mode 100644 index 00000000..7d0b827d --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE6960119F_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE6960119F_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE6960119F_CLEAN_gf_inv(PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE6960119F_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE6960119F_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/synd.h 
b/crypto_kem/mceliece6960119f/clean/synd.h new file mode 100644 index 00000000..c7591722 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_SYND_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE6960119F_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/transpose.c b/crypto_kem/mceliece6960119f/clean/transpose.c new file mode 100644 index 00000000..1bce3c7b --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece6960119f/clean/transpose.h b/crypto_kem/mceliece6960119f/clean/transpose.h new file mode 100644 index 00000000..ec71ae68 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE6960119F_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/clean/util.c b/crypto_kem/mceliece6960119f/clean/util.c new file mode 100644 index 00000000..7419196d --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE6960119F_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119F_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119F_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE6960119F_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119F_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE6960119F_CLEAN_bitrev(gf a) { + a = ((a & 
0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece6960119f/clean/util.h b/crypto_kem/mceliece6960119f/clean/util.h new file mode 100644 index 00000000..86639980 --- /dev/null +++ b/crypto_kem/mceliece6960119f/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119F_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE6960119F_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE6960119F_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE6960119F_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE6960119F_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE6960119F_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE6960119F_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE6960119F_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/LICENSE b/crypto_kem/mceliece6960119f/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece6960119f/sse/Makefile b/crypto_kem/mceliece6960119f/sse/Makefile new file mode 100644 index 00000000..927dc8e2 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece6960119f_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece6960119f/sse/aes256ctr.c b/crypto_kem/mceliece6960119f/sse/aes256ctr.c new file mode 100644 index 00000000..93cfc48b --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE6960119F_SSE_aes256ctr( + uint8_t *out, + size_t 
outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece6960119f/sse/aes256ctr.h b/crypto_kem/mceliece6960119f/sse/aes256ctr.h new file mode 100644 index 00000000..b079d110 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE6960119F_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119f/sse/api.h b/crypto_kem/mceliece6960119f/sse/api.h new file mode 100644 index 00000000..73557280 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_API_H +#define PQCLEAN_MCELIECE6960119F_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_ALGNAME "Classic McEliece 6960119f" +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif diff --git a/crypto_kem/mceliece6960119f/sse/benes.c b/crypto_kem/mceliece6960119f/sse/benes.c new file mode 100644 index 00000000..c912b0d7 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119F_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119F_SSE_load8(ptr), PQCLEAN_MCELIECE6960119F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119F_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119F_SSE_load8(ptr), PQCLEAN_MCELIECE6960119F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i 
+ 0][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119F_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece6960119f/sse/benes.h b/crypto_kem/mceliece6960119f/sse/benes.h new file mode 100644 index 00000000..6a65a79c --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_BENES_H +#define PQCLEAN_MCELIECE6960119F_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119F_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6960119F_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/bm.c b/crypto_kem/mceliece6960119f/sse/bm.c new file mode 100644 index 00000000..5d06ed97 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/bm.c @@ -0,0 +1,204 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE6960119F_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6960119F_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[i], m0); + v1 = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119F_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + 
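+    /* Note (assuming the standard inversion-free Berlekamp-Massey
+       formulation from the paper referenced above): C holds the current
+       candidate error-locator polynomial and B the previous one, both
+       bitsliced across GFBITS vectors; d and b (declared below) are the
+       current and stored discrepancies, broadcast into dd and bb so that
+       the update C <- b*C + d*B can be formed via B_tmp and C_tmp. */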
+ gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0, one << 63); + B[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0, one << 62); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119F_SSE_update_asm(interval, coefs[N]); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(prod, C, (vec128 *) interval); + d = PQCLEAN_MCELIECE6960119F_SSE_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE6960119F_SSE_update_asm(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(C[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(C[i], 1); + + out[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/crypto_kem/mceliece6960119f/sse/bm.h b/crypto_kem/mceliece6960119f/sse/bm.h new file mode 100644 index 00000000..fe2a9be6 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_BM_H +#define PQCLEAN_MCELIECE6960119F_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119F_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/consts.S b/crypto_kem/mceliece6960119f/sse/consts.S new file mode 100644 index 00000000..5cc432db --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE6960119F_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE6960119F_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119F_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119F_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119F_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119F_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119F_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119F_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 
0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119F_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119F_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119F_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119F_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119F_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece6960119f/sse/consts.inc b/crypto_kem/mceliece6960119f/sse/consts.inc new file mode 100644 index 00000000..30188c8f --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece6960119f/sse/controlbits.c b/crypto_kem/mceliece6960119f/sse/controlbits.c new file mode 100644 index 00000000..b3e37fc9 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119F_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119F_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119F_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119F_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/sse/controlbits.h b/crypto_kem/mceliece6960119f/sse/controlbits.h new file mode 100644 index 00000000..9a9f89bb --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119F_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119F_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119F_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/crypto_hash.h b/crypto_kem/mceliece6960119f/sse/crypto_hash.h new file mode 100644 index 00000000..963365d1 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119F_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119f/sse/decrypt.c b/crypto_kem/mceliece6960119f/sse/decrypt.c new file mode 100644 index 00000000..23f6e494 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6960119F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119F_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119F_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119F_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119F_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += 
_mm_popcnt_u64(PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u32( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(diff, PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6960119F_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119F_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 64 ][ GFBITS ]; + vec128 scaled[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + + vec128 error[ 64 ]; + + vec128 s_priv[ 2 ][ GFBITS ]; + vec128 s_priv_cmp[ 2 ][ GFBITS ]; + + vec128 locator[ GFBITS ]; + + vec128 recv[ 64 ]; + vec128 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6960119F_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119F_SSE_benes(recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6960119F_SSE_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119F_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119F_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6960119F_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6960119F_SSE_benes(error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119f/sse/decrypt.h b/crypto_kem/mceliece6960119f/sse/decrypt.h new file mode 100644 index 00000000..03a97906 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE6960119F_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119F_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/encrypt.c b/crypto_kem/mceliece6960119f/sse/encrypt.c new file mode 100644 index 00000000..1c53e9e1 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/encrypt.c @@ -0,0 +1,105 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void 
PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119F_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119F_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119f/sse/encrypt.h b/crypto_kem/mceliece6960119f/sse/encrypt.h new file mode 100644 index 00000000..9752ecc4 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119F_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119F_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/fft.c b/crypto_kem/mceliece6960119f/sse/fft.c new file mode 100644 index 00000000..31f19ba1 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/fft.c @@ -0,0 +1,231 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[34], 
pre[0][i]); + buf[19] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119F_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119f/sse/fft.h b/crypto_kem/mceliece6960119f/sse/fft.h new file mode 100644 index 00000000..b8f764ea --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_FFT_H +#define PQCLEAN_MCELIECE6960119F_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE6960119F_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/fft_tr.c b/crypto_kem/mceliece6960119f/sse/fft_tr.c new file mode 100644 index 00000000..60046cf9 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(tmp0[b], 
tmp1[b]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE6960119F_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +/* justifying the length of the output */ +static void postprocess(vec128 out[][GFBITS]) { + int i; + uint64_t v[2]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(out[1][i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(out[1][i], 1); + + v[1] <<= (128 - SYS_T) * 2; + v[1] >>= (128 - SYS_T) * 2; + + out[1][i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119F_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119f/sse/fft_tr.h 
b/crypto_kem/mceliece6960119f/sse/fft_tr.h new file mode 100644 index 00000000..09702f4e --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE6960119F_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119F_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/gf.c b/crypto_kem/mceliece6960119f/sse/gf.c new file mode 100644 index 00000000..b731c0be --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119F_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119F_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119F_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = 
gf_sqmul(den, den); // ^11
+    tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111
+    out = gf_sq2(tmp_1111);
+    out = gf_sq2mul(out, tmp_1111); // ^11111111
+    out = gf_sq2(out);
+    out = gf_sq2mul(out, tmp_1111); // ^111111111111
+
+    return gf_sqmul(out, num); // ^1111111111110 = ^-1
+}
+
+/* return 1/den */
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_inv(gf in) {
+    return PQCLEAN_MCELIECE6960119F_SSE_gf_frac(in, ((gf) 1));
+}
+
+/* multiplication in GF((2^m)^t) */
+void PQCLEAN_MCELIECE6960119F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) {
+    int i, j;
+
+    gf prod[237];
+
+    for (i = 0; i < 237; i++) {
+        prod[i] = 0;
+    }
+
+    for (i = 0; i < 119; i++) {
+        for (j = 0; j < 119; j++) {
+            prod[i + j] ^= PQCLEAN_MCELIECE6960119F_SSE_gf_mul(in0[i], in1[j]);
+        }
+    }
+
+    //
+
+    for (i = 236; i >= 119; i--) {
+        prod[i - 117] ^= PQCLEAN_MCELIECE6960119F_SSE_gf_mul(prod[i], (gf) 6400);
+        prod[i - 119] ^= PQCLEAN_MCELIECE6960119F_SSE_gf_mul(prod[i], (gf) 3134);
+    }
+
+    for (i = 0; i < 119; i++) {
+        out[i] = prod[i];
+    }
+}
+
diff --git a/crypto_kem/mceliece6960119f/sse/gf.h b/crypto_kem/mceliece6960119f/sse/gf.h
new file mode 100644
index 00000000..99507f1c
--- /dev/null
+++ b/crypto_kem/mceliece6960119f/sse/gf.h
@@ -0,0 +1,22 @@
+#ifndef PQCLEAN_MCELIECE6960119F_SSE_GF_H
+#define PQCLEAN_MCELIECE6960119F_SSE_GF_H
+/*
+  This file is for functions for field arithmetic
+*/
+
+
+#include <stdint.h>
+
+typedef uint16_t gf;
+
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_iszero(gf a);
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_add(gf in0, gf in1);
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_mul(gf in0, gf in1);
+uint64_t PQCLEAN_MCELIECE6960119F_SSE_gf_mul2(gf a, gf b0, gf b1);
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_frac(gf den, gf num);
+gf PQCLEAN_MCELIECE6960119F_SSE_gf_inv(gf in);
+
+void PQCLEAN_MCELIECE6960119F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1);
+
+#endif
+
diff --git a/crypto_kem/mceliece6960119f/sse/operations.c b/crypto_kem/mceliece6960119f/sse/operations.c
new file mode 100644
index 00000000..1f084078
--- /dev/null
+++ b/crypto_kem/mceliece6960119f/sse/operations.c
@@ -0,0 +1,136 @@
+#include "api.h"
+
+#include "aes256ctr.h"
+#include "controlbits.h"
+#include "crypto_hash.h"
+#include "decrypt.h"
+#include "encrypt.h"
+#include "params.h"
+#include "pk_gen.h"
+#include "randombytes.h"
+#include "sk_gen.h"
+#include "util.h"
+
+#include <stdint.h>
+#include <string.h>
+
+int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_enc(
+    uint8_t *c,
+    uint8_t *key,
+    const uint8_t *pk
+) {
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1};
+
+    PQCLEAN_MCELIECE6960119F_SSE_encrypt(c, e, pk);
+
+    crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e));
+
+    memcpy(one_ec + 1, e, SYS_N / 8);
+    memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32);
+
+    crypto_hash_32b(key, one_ec, sizeof(one_ec));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_dec(
+    uint8_t *key,
+    const uint8_t *c,
+    const uint8_t *sk
+) {
+    int i;
+
+    uint8_t ret_confirm = 0;
+    uint8_t ret_decrypt = 0;
+
+    uint16_t m;
+
+    uint8_t conf[32];
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ];
+    uint8_t *x = preimage;
+
+    //
+
+    ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119F_SSE_decrypt(e, sk + SYS_N / 8, c);
+
+    crypto_hash_32b(conf, two_e, sizeof(two_e));
+
+    for (i = 0; i < 32; i++) {
+        ret_confirm |= conf[i] ^ c[SYND_BYTES + i];
+    }
+
+    m = ret_decrypt | ret_confirm;
+    m -= 1;
+    m >>= 8;
+
+    *x++ = (~m & 0) | (m & 1);
+    for (i = 0; i < SYS_N / 8; i++) {
+        *x++ = (~m & sk[i]) | (m & e[i]);
+    }
+    for (i = 0; i < SYND_BYTES + 32; i++) {
+        *x++ = c[i];
+    }
+
+    crypto_hash_32b(key, preimage, sizeof(preimage));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE6960119F_SSE_crypto_kem_keypair
+(
+    uint8_t *pk,
+    uint8_t *sk
+) {
+    int i;
+    uint8_t seed[ 32 ];
+    uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ];
+    uint8_t nonce[ 16 ] = {0};
+    uint8_t *rp;
+
+    gf f[ SYS_T ]; // element in GF(2^mt)
+    gf irr[ SYS_T ]; // Goppa polynomial
+    uint32_t perm[ 1 << GFBITS ]; // random permutation
+
+    randombytes(seed, sizeof(seed));
+
+    while (1) {
+        rp = r;
+        PQCLEAN_MCELIECE6960119F_SSE_aes256ctr(r, sizeof(r), nonce, seed);
+        memcpy(seed, &r[ sizeof(r) - 32 ], 32);
+
+        for (i = 0; i < SYS_T; i++) {
+            f[i] = PQCLEAN_MCELIECE6960119F_SSE_load2(rp + i * 2);
+        }
+        rp += sizeof(f);
+        if (PQCLEAN_MCELIECE6960119F_SSE_genpoly_gen(irr, f)) {
+            continue;
+        }
+
+        for (i = 0; i < (1 << GFBITS); i++) {
+            perm[i] = PQCLEAN_MCELIECE6960119F_SSE_load4(rp + i * 4);
+        }
+        rp += sizeof(perm);
+        if (PQCLEAN_MCELIECE6960119F_SSE_perm_check(perm)) {
+            continue;
+        }
+
+        for (i = 0; i < SYS_T; i++) {
+            PQCLEAN_MCELIECE6960119F_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]);
+        }
+        if (PQCLEAN_MCELIECE6960119F_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) {
+            continue;
+        }
+
+        memcpy(sk, rp, SYS_N / 8);
+        PQCLEAN_MCELIECE6960119F_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm);
+
+        break;
+    }
+
+    return 0;
+}
+
diff --git a/crypto_kem/mceliece6960119f/sse/params.h b/crypto_kem/mceliece6960119f/sse/params.h
new file mode 100644
index 00000000..f3020f20
--- /dev/null
+++ b/crypto_kem/mceliece6960119f/sse/params.h
@@ -0,0 +1,21 @@
+#ifndef PQCLEAN_MCELIECE6960119F_SSE_PARAMS_H
+#define PQCLEAN_MCELIECE6960119F_SSE_PARAMS_H
+
+#define GFBITS 13
+#define SYS_N 6960
+#define SYS_T 119
+
+#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1))
+#define IRR_BYTES (SYS_T * 2)
+
+#define PK_NROWS (SYS_T*GFBITS)
+#define PK_NCOLS (SYS_N - PK_NROWS)
+#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)
+
+#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES)
+#define SYND_BYTES ((PK_NROWS + 7)/8)
+
+#define GFMASK ((1 << GFBITS) - 1)
+
+#endif
+
diff --git a/crypto_kem/mceliece6960119f/sse/pk_gen.c b/crypto_kem/mceliece6960119f/sse/pk_gen.c
new file mode 100644
index 00000000..23b541e7
--- /dev/null
+++ b/crypto_kem/mceliece6960119f/sse/pk_gen.c
@@ -0,0 +1,359 @@
+/*
+  This file is for public-key generation
+*/
+
+#include "pk_gen.h"
+
+#include "benes.h"
+#include "controlbits.h"
+#include "fft.h"
+#include "params.h"
+#include "util.h"
+
+#include
+#include
+
+#define min(a, b) (((a) < (b)) ?
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) { + int i, j, k, s, block_idx, row, tail; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + tail = row % 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) | + (mat[ row + i ][ block_idx + 1 ] << (64 - tail)); + } + + // compute the column indices of pivots by Gaussian elimination. 
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) | + (mat[ i + j ][ block_idx + 1 ] << (64 - tail)); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail)); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6960119F_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[64][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ NBLOCKS2_H * 2 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119F_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119F_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[k] = mat[ row ][k]; + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119F_SSE_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119F_SSE_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/sse/pk_gen.h b/crypto_kem/mceliece6960119f/sse/pk_gen.h new file mode 100644 index 00000000..215875a9 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE6960119F_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119F_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/scalars_2x.inc b/crypto_kem/mceliece6960119f/sse/scalars_2x.inc new file mode 100644 index 00000000..79d1fe1c --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece6960119f/sse/scalars_4x.inc b/crypto_kem/mceliece6960119f/sse/scalars_4x.inc new file mode 100644 index 00000000..2ac61f2d --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 
0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece6960119f/sse/sk_gen.c b/crypto_kem/mceliece6960119f/sse/sk_gen.c new file mode 100644 index 00000000..05ca7bcc --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119F_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119F_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119F_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119F_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119F_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119F_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119F_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119F_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/sse/sk_gen.h b/crypto_kem/mceliece6960119f/sse/sk_gen.h new file mode 100644 index 00000000..86b99297 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE6960119F_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119F_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119F_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/sse/syndrome_asm.S b/crypto_kem/mceliece6960119f/sse/syndrome_asm.S new file mode 100644 index 00000000..50af277a --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/syndrome_asm.S @@ -0,0 +1,1311 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# 
qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64 i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm +.global PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm +_PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm: +PQCLEAN_MCELIECE6960119F_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea (addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#3 +# asm 2: movdqu 16(ee=%xmm2 +movdqu 16(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#3 +# asm 2: movdqu 32(ee=%xmm2 +movdqu 32(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#3 +# asm 2: movdqu 48(ee=%xmm2 +movdqu 48(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#3 +# asm 2: movdqu 64(ee=%xmm2 +movdqu 64(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#3 +# asm 2: movdqu 80(ee=%xmm2 +movdqu 80(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 
96(ee=reg128#3 +# asm 2: movdqu 96(ee=%xmm2 +movdqu 96(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#3 +# asm 2: movdqu 112(ee=%xmm2 +movdqu 112(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#3 +# asm 2: movdqu 128(ee=%xmm2 +movdqu 128(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#3 +# asm 2: movdqu 144(ee=%xmm2 +movdqu 144(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#3 +# asm 2: movdqu 160(ee=%xmm2 +movdqu 160(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#3 +# asm 2: movdqu 176(ee=%xmm2 +movdqu 176(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#3 +# asm 2: movdqu 192(ee=%xmm2 +movdqu 192(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#3 +# asm 2: movdqu 208(ee=%xmm2 +movdqu 208(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 336(pp=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand 
pp=reg128#2 +# asm 2: movdqu 352(pp=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 368(pp=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 384(pp=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 400(pp=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 416(pp=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 432(pp=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(ee=reg128#3 +# asm 2: movdqu 432(ee=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 448(pp=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(ee=reg128#3 +# asm 2: movdqu 448(ee=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 464(pp=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(ee=reg128#3 +# asm 2: movdqu 464(ee=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 480(pp=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(ee=reg128#3 +# asm 2: movdqu 480(ee=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 496(pp=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(ee=reg128#3 +# asm 2: movdqu 496(ee=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 512(pp=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(ee=reg128#3 +# asm 2: movdqu 512(ee=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 528(pp=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(ee=reg128#3 +# asm 2: movdqu 528(ee=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 544(pp=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(ee=reg128#3 +# asm 2: movdqu 544(ee=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 560(pp=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(ee=reg128#3 +# asm 2: movdqu 560(ee=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 576(pp=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(ee=reg128#3 +# asm 2: movdqu 576(ee=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 592(pp=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 592 
] +# asm 1: movdqu 592(ee=reg128#3 +# asm 2: movdqu 592(ee=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 608(pp=%xmm1 +movdqu 608(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(ee=reg128#3 +# asm 2: movdqu 608(ee=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 624(pp=%xmm1 +movdqu 624(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(ee=reg128#3 +# asm 2: movdqu 624(ee=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 640(pp=%xmm1 +movdqu 640(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(ee=reg128#3 +# asm 2: movdqu 640(ee=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 656(pp=%xmm1 +movdqu 656(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(ee=reg128#3 +# asm 2: movdqu 656(ee=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov $676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: +._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa 
PQCLEAN_MCELIECE6960119F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 
+# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# 
asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor 
%xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 
+vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + 
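+# The qhasm blocks in this file appear to be the generated bit-matrix
+# transpose of the mceliece6960119f SSE implementation: for each group of
+# eight 128-bit rows (x0..x7), pairs of rows are exchanged at 32-, 16- and
+# 8-bit granularity using the PQCLEAN_MCELIECE6960119F_SSE_MASK5_*,
+# MASK4_* and MASK3_* constants. Each exchange follows the usual
+# mask/shift/OR pattern; in scalar form (illustrative sketch only,
+# with s = 32, 16 or 8):
+#   x' = (x & mask_even) | (y << s);
+#   y' = (x >> s)        | (y & mask_odd);
+# which is what the vpand / vpsllq / vpsrlq / vpor triples (and their
+# vpslld/vpsrld and vpsllw/vpsrlw variants) above and below compute on
+# the xmm registers, one 128-bit column block at a time.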
+# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & 
mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | 
v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor 
x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 
+vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor 
%xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 
2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 
| v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 
+# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand 
v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor 
x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 
+ 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 
| v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor 
x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand 
v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: 
vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# 
asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# 
qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 
2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: 
vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 
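+# (The v00/v10/v01/v11 sequence below is one step of the bit-interleaving
+# pattern repeated throughout this routine: complementary bit groups are
+# selected with the mask constants via vpand, one half is shifted by 4, 2
+# or 1 bit positions with psllq/psrlq, and the two halves are recombined
+# with vpor before the words are stored back.)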
+# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x 
v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x 
v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand 
%xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor 
x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 
+movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 
1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# 
asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# 
qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: 
movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# 
asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = 
x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | 
v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 
0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119F_SSE_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in 
>> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119F_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119F_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119F_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119F_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119F_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119F_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119F_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x( PQCLEAN_MCELIECE6960119F_SSE_load8(in), PQCLEAN_MCELIECE6960119F_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119F_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119F_SSE_store8(out + 0, PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119F_SSE_store8(out + 8, PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece6960119f/sse/util.h b/crypto_kem/mceliece6960119f/sse/util.h new file mode 100644 index 00000000..8052ac61 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_UTIL_H +#define PQCLEAN_MCELIECE6960119F_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119F_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119F_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119F_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119F_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119F_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119F_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119F_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119F_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119F_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece6960119f/sse/vec128.c b/crypto_kem/mceliece6960119f/sse/vec128.c new file mode 100644 index 00000000..5fe02f6d --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + 
+#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE6960119F_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119F_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(result[10], in[11]); + result[11] = 
PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6960119F_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6960119f/sse/vec128.h b/crypto_kem/mceliece6960119f/sse/vec128.h new file mode 100644 index 00000000..2901ad95 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE6960119F_SSE_VEC128_H +#define PQCLEAN_MCELIECE6960119F_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE6960119F_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119F_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119F_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119F_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE6960119F_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE6960119F_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece6960119f/sse/vec128_mul_asm.S b/crypto_kem/mceliece6960119f/sse/vec128_mul_asm.S new file mode 100644 index 00000000..e0dbdaa5 --- /dev/null +++ b/crypto_kem/mceliece6960119f/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE6960119F_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 
+movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 
1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# 
qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# 
asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 
+vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 
192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 
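
The tail of the generated routine above repeatedly loads two adjacent 64-bit words, XORs them, takes popcnt, and keeps only the low bit — i.e. each pair of words (one 128-bit lane) appears to be reduced to a single parity bit. A minimal standalone sketch of that reduction is below; the function names, the portable fallback, and the small main() are illustrative only and not part of the patch, and __builtin_popcountll is a GCC/Clang builtin rather than anything the patch relies on.

#include <stdint.h>
#include <stdio.h>

/* Parity (XOR of all bits) of a 128-bit lane held as two 64-bit words,
   mirroring the "xor, popcnt, and $1" pattern in the generated assembly.
   __builtin_popcountll is a GCC/Clang builtin. */
static unsigned parity128(uint64_t lo, uint64_t hi) {
    return (unsigned)__builtin_popcountll(lo ^ hi) & 1u;
}

/* Portable equivalent: fold the word onto itself until one bit remains. */
static unsigned parity128_fold(uint64_t lo, uint64_t hi) {
    uint64_t t = lo ^ hi;
    t ^= t >> 32; t ^= t >> 16; t ^= t >> 8;
    t ^= t >> 4;  t ^= t >> 2;  t ^= t >> 1;
    return (unsigned)(t & 1);
}

int main(void) {
    uint64_t lo = 0x0123456789ABCDEFULL, hi = 0xFEDCBA9876543210ULL;
    /* Both variants agree for any input. */
    printf("%u %u\n", parity128(lo, hi), parity128_fold(lo, hi));
    return 0;
}

The xor-fold variant is the same reduction the C code later uses in vec_reduce(); the assembly simply trades it for a single popcnt instruction.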
24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119F_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece6960119f/vec/api.h b/crypto_kem/mceliece6960119f/vec/api.h new file mode 100644 index 00000000..127da5f5 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_API_H +#define PQCLEAN_MCELIECE6960119F_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_ALGNAME "Classic McEliece 6960119f" +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif diff --git a/crypto_kem/mceliece6960119f/vec/benes.c b/crypto_kem/mceliece6960119f/vec/benes.c new file mode 100644 index 00000000..cfe8b7f4 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119F_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + 
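
Both layer_in() and layer_ex() above apply one masked-exchange primitive across 64 bit positions at once: wherever the condition word has a 1 bit, the corresponding bits of the two data words are swapped; where it has a 0 bit, they are left untouched, with no data-dependent branch. A small self-contained illustration follows; the function name and the demo values are illustrative and not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* Swap the bits of *x and *y at every position where cond has a 1 bit.
   d is zero wherever cond is zero, so those positions are XORed with 0
   and remain unchanged; no branch depends on the data or the condition. */
static void masked_exchange(uint64_t *x, uint64_t *y, uint64_t cond) {
    uint64_t d = (*x ^ *y) & cond;
    *x ^= d;
    *y ^= d;
}

int main(void) {
    uint64_t x = 0x00000000FFFFFFFFULL;
    uint64_t y = 0xFFFFFFFF00000000ULL;
    masked_exchange(&x, &y, 0x0F0F0F0F0F0F0F0FULL); /* swap alternate nibbles */
    printf("%016llx %016llx\n", (unsigned long long)x, (unsigned long long)y);
    return 0;
}

In the Benes routine, layer_ex() applies this to stride-s pairs for the first and last stages and layer_in() to both 64-word halves for the middle stages, with the condition words taken from the precomputed control bits.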
PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE6960119F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/vec/benes.h b/crypto_kem/mceliece6960119f/vec/benes.h new file mode 100644 index 00000000..9adf5c20 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_BENES_H +#define PQCLEAN_MCELIECE6960119F_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE6960119F_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/bm.c b/crypto_kem/mceliece6960119f/vec/bm.c new file mode 100644 index 00000000..78c86ddd --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/bm.c @@ -0,0 +1,239 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + 
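
The helpers mask_nonzero() and mask_leq() above turn a comparison into an all-ones or all-zero 16-bit mask without branching, and vec_cmov() then uses such a mask (broadcast to 64 bits) to pick between two values. A compact restatement of the trick, with a small demo, is given below; the helper names and main() are illustrative and not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* 0xFFFF if a != 0, else 0 -- no data-dependent branch. */
static uint16_t mask_if_nonzero(uint16_t a) {
    uint32_t r = a;
    r = (r - 1) >> 31;          /* 1 exactly when a == 0 */
    return (uint16_t)(r - 1);   /* 1 - 1 = 0, 0 - 1 = all ones */
}

/* 0xFFFF if a <= b, else 0. */
static uint16_t mask_if_leq(uint16_t a, uint16_t b) {
    uint32_t r = ((uint32_t)b - (uint32_t)a) >> 31;  /* 1 exactly when a > b */
    return (uint16_t)(r - 1);
}

/* Branch-free select: x where the mask is all-ones, y where it is zero. */
static uint16_t select16(uint16_t x, uint16_t y, uint16_t mask) {
    return (uint16_t)((x & mask) | (y & (uint16_t)~mask));
}

int main(void) {
    printf("%04x %04x\n", mask_if_nonzero(0), mask_if_nonzero(7));   /* 0000 ffff */
    printf("%04x %04x\n", mask_if_leq(3, 5), mask_if_leq(6, 5));     /* ffff 0000 */
    printf("%u\n", (unsigned)select16(11, 22, mask_if_leq(3, 5)));   /* 11 */
    return 0;
}

vec_cmov() performs the same selection 64 bits at a time by first broadcasting the 16-bit mask into a full word with vec_set1_16b().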
vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +//void bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) +void PQCLEAN_MCELIECE6960119F_VEC_bm(vec out[][GFBITS], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = one << 63; + B[0][0] = 0; + B[1][0] = one << 62; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + update(interval, coefs[N]); + + 
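
The update() helper above maintains, for each of the GFBITS bit positions, a 128-bit window of the most recent sequence coefficients, stored as two 64-bit words per slice: the window is shifted down by one position and bit i of the new field element is inserted at the top. The sketch below isolates that per-slice step; the struct, the names, and main() are illustrative only and not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* One bit-slice of a 128-bit sliding window, stored as two 64-bit words:
   lo holds positions 0..63, hi holds positions 64..127. */
typedef struct { uint64_t lo, hi; } window128;

/* Shift the window right by one position and insert 'bit' (0 or 1) at the
   top -- the same two-word shift performed inside update(). */
static void shift_in(window128 *w, uint64_t bit) {
    w->lo = (w->lo >> 1) | (w->hi << 63);  /* old position 64 moves to 63  */
    w->hi = (w->hi >> 1) | (bit   << 63);  /* new bit enters at position 127 */
}

int main(void) {
    window128 w = { 0, 0 };
    /* Feed in the bits 1, 0, 1: after three steps they occupy the top
       three positions of the window, newest bit highest. */
    shift_in(&w, 1);
    shift_in(&w, 0);
    shift_in(&w, 1);
    printf("%016llx %016llx\n",
           (unsigned long long)w.hi, (unsigned long long)w.lo);
    return 0;
}

get_coefs() and vec_reduce() then convert between this bitsliced layout and ordinary field elements when the Berlekamp-Massey loop needs a scalar discrepancy.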
PQCLEAN_MCELIECE6960119F_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(prod[1], C[1], interval[1]); + + d = vec_reduce(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE6960119F_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE6960119F_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, 0); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + out[0][i] = (C[0][i] >> 8) | (C[1][i] << 56); + out[1][i] = C[1][i] >> 8; + } +} + diff --git a/crypto_kem/mceliece6960119f/vec/bm.h b/crypto_kem/mceliece6960119f/vec/bm.h new file mode 100644 index 00000000..dbe7e8e8 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_BM_H +#define PQCLEAN_MCELIECE6960119F_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6960119F_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/consts.inc b/crypto_kem/mceliece6960119f/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 
0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 
0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 
0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 
0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 
0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 
0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 
0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece6960119f/vec/controlbits.c b/crypto_kem/mceliece6960119f/vec/controlbits.c new file mode 100644 index 00000000..bd95e7cc --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119F_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119F_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119F_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; 
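The sorting network in controlbits.c is built from two branchless primitives defined above: a borrow-based comparison (is_smaller) and a mask-based conditional swap (cswap). The following standalone sketch is not part of the PQClean sources; the demo_* names are illustrative, and the comparison is only valid for inputs below 2^31, as in the callers above.

#include <stdint.h>
#include <stdio.h>

/* Returns 1 iff a < b, read off the borrow of the 32-bit subtraction.
   Valid only for inputs below 2^31, which holds for the indices sorted above. */
static uint8_t demo_is_smaller(uint32_t a, uint32_t b) {
    return (uint8_t)((a - b) >> 31);
}

/* Swap *x and *y iff swap == 1, without a data-dependent branch. */
static void demo_cswap(uint32_t *x, uint32_t *y, uint8_t swap) {
    uint32_t mask = 0 - (uint32_t)swap;   /* 0x00000000 or 0xFFFFFFFF */
    uint32_t d = (*x ^ *y) & mask;
    *x ^= d;
    *y ^= d;
}

int main(void) {
    uint32_t x = 7, y = 3;
    demo_cswap(&x, &y, demo_is_smaller(y, x));  /* min into x, max into y */
    printf("%u %u\n", x, y);                    /* prints "3 7" */
    return 0;
}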
+ + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119F_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/vec/controlbits.h b/crypto_kem/mceliece6960119f/vec/controlbits.h new file mode 100644 index 00000000..cd62e1c0 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119F_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119F_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119F_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/crypto_hash.h b/crypto_kem/mceliece6960119f/vec/crypto_hash.h new file mode 100644 index 00000000..9952c054 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119F_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece6960119f/vec/decrypt.c b/crypto_kem/mceliece6960119f/vec/decrypt.c new file mode 100644 index 00000000..68f2863e --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/decrypt.c @@ -0,0 +1,193 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE6960119F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119F_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 128; i++) { + recv[i] = PQCLEAN_MCELIECE6960119F_VEC_load8(r + i * 8); + } +} + +static void postprocess(unsigned char *e, vec *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE6960119F_VEC_store8(error8 + i * 8, err[i]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static uint16_t weight_check(const unsigned char *e, const vec *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < (1 << GFBITS); i++) { + w0 += (error[i / 64] >> (i % 64)) & 1; + } + + for (i = 0; i < SYS_N; i++) { + w1 += (e[i / 8] >> (i % 8)) & 1; + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) 
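weight_check() above turns "do both Hamming weights equal SYS_T?" into a 0/1 result with no branch: XOR against the expected value, subtract 1 so only an exact match wraps the top bit on, then shift that bit down. A minimal sketch of the idiom, assuming (as for the weights above) that both inputs fit in 15 bits; the demo_ name is illustrative and not part of the sources.

#include <stdint.h>
#include <stdio.h>

/* Returns 1 iff a == b, for values below 2^15. */
static uint16_t demo_eq_mask(uint16_t a, uint16_t b) {
    uint16_t check = a ^ b;   /* 0 iff equal */
    check -= 1;               /* wraps to 0xFFFF iff it was 0 */
    check >>= 15;             /* top bit is the 0/1 answer */
    return check;
}

int main(void) {
    printf("%d %d\n", demo_eq_mask(119, 119), demo_eq_mask(119, 118)); /* 1 0 */
    return 0;
}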
{ + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE6960119F_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119F_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE6960119F_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE6960119F_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119F_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119F_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119F_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE6960119F_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE6960119F_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE6960119F_VEC_benes(error, sk + IRR_BYTES, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece6960119f/vec/decrypt.h b/crypto_kem/mceliece6960119f/vec/decrypt.h new file mode 100644 index 00000000..e180bb18 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE6960119F_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119F_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/encrypt.c b/crypto_kem/mceliece6960119f/vec/encrypt.c new file mode 100644 index 00000000..50247e67 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/encrypt.c @@ -0,0 +1,152 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind[ SYS_T * 2 ]; + uint8_t *ind8 = (uint8_t *)ind; + uint32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind32[i] == ind32[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < 
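gen_e() above samples an error vector of weight SYS_T by rejection: draw 2*SYS_T candidate values, discard those outside [0, SYS_N), and restart if fewer than SYS_T survive or any survivor repeats. The sketch below reproduces only that rejection-sampling skeleton with toy parameters; rand() is a placeholder for illustration only and is not a substitute for the randombytes() call used above.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_N 64   /* toy code length  (SYS_N in the real code) */
#define TOY_T 6    /* toy error weight (SYS_T in the real code) */

/* Fill ind[0..TOY_T-1] with distinct indices in [0, TOY_N). */
static void toy_gen_indices(uint32_t ind[TOY_T]) {
    uint32_t cand[2 * TOY_T];
    int count, i, j, repeat;

    for (;;) {
        /* draw twice as many candidates as needed, keep the in-range ones */
        count = 0;
        for (i = 0; i < 2 * TOY_T; i++) {
            uint32_t r = (uint32_t)rand() & 0x7F;   /* placeholder randomness */
            if (r < TOY_N) {
                cand[count++] = r;
            }
        }
        if (count < TOY_T) {
            continue;                               /* not enough survivors */
        }

        /* restart on any repetition among the first TOY_T survivors */
        repeat = 0;
        for (i = 1; i < TOY_T; i++) {
            for (j = 0; j < i; j++) {
                if (cand[i] == cand[j]) {
                    repeat = 1;
                }
            }
        }
        if (!repeat) {
            break;
        }
    }

    for (i = 0; i < TOY_T; i++) {
        ind[i] = cand[i];
    }
}

int main(void) {
    uint32_t ind[TOY_T];
    toy_gen_indices(ind);
    for (int i = 0; i < TOY_T; i++) {
        printf("%u ", ind[i]);
    }
    printf("\n");
    return 0;
}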
(SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119F_VEC_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char e_tmp[ SYS_N / 8 ]; + + uint64_t b; + + const uint8_t *pk_ptr8; + const uint8_t *e_ptr8 = e_tmp + SYND_BYTES - 1; + + int i, j, k, tail = (PK_NROWS % 8); + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + s[i - 1] &= (1 << tail) - 1; + + for (i = SYND_BYTES - 1; i < SYS_N / 8 - 1; i++) { + e_tmp[i] = (e[i] >> tail) | (e[i + 1] << (8 - tail)); + } + + e_tmp[i] = e[i] >> tail; + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE6960119F_VEC_load8(pk_ptr8 + j * 8) & PQCLEAN_MCELIECE6960119F_VEC_load8(e_ptr8 + j * 8); + } + + for (k = 0; k < (PK_NCOLS % 64 + 7) / 8; k++) { + b ^= pk_ptr8[8 * j + k] & e_ptr8[8 * j + k]; + } + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119F_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece6960119f/vec/encrypt.h b/crypto_kem/mceliece6960119f/vec/encrypt.h new file mode 100644 index 00000000..d96aeb47 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119F_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119F_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/fft.c b/crypto_kem/mceliece6960119f/vec/fft.c new file mode 100644 index 00000000..4d8f41e1 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/fft.c @@ -0,0 +1,269 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[0], in[0], s[j][0]); + 
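The syndrome() routine above computes each syndrome bit as a GF(2) dot product: AND a public-key row against the error vector 64 bits at a time, XOR the products together, then fold the word down to its parity. A self-contained sketch of that fold (demo_ names are illustrative, not part of the sources):

#include <stdint.h>
#include <stdio.h>

/* Parity (XOR of all bits) of a 64-bit word, folded as in syndrome() above. */
static uint64_t demo_parity64(uint64_t b) {
    b ^= b >> 32;
    b ^= b >> 16;
    b ^= b >> 8;
    b ^= b >> 4;
    b ^= b >> 2;
    b ^= b >> 1;
    return b & 1;
}

/* GF(2) dot product of two bit vectors packed into 64-bit words. */
static uint64_t demo_dot_gf2(const uint64_t *a, const uint64_t *b, int words) {
    uint64_t acc = 0;
    for (int i = 0; i < words; i++) {
        acc ^= a[i] & b[i];
    }
    return demo_parity64(acc);
}

int main(void) {
    uint64_t a[2] = {0x5, 0x1};   /* bits 0, 2 and 64 set */
    uint64_t b[2] = {0x6, 0x1};   /* bits 1, 2 and 64 set */
    /* two bits in common, so the dot product is 0 */
    printf("%llu\n", (unsigned long long)demo_dot_gf2(a, b, 2));
    return 0;
}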
PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] 
= buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += ((uint64_t)1 << i); + } + + // adding the part contributed by x^128 + +// for (i = 0; i < 128; i++) +// for (b = 0; b < GFBITS; b++) +// out[i][b] ^= powers[i][b]; +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119F_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece6960119f/vec/fft.h b/crypto_kem/mceliece6960119f/vec/fft.h new file mode 100644 index 00000000..cd18799a --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_FFT_H +#define PQCLEAN_MCELIECE6960119F_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include 
"vec.h" + +void PQCLEAN_MCELIECE6960119F_VEC_fft(vec out[][ GFBITS ], vec in[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/fft_tr.c b/crypto_kem/mceliece6960119f/vec/fft_tr.c new file mode 100644 index 00000000..de7ef2b5 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/fft_tr.c @@ -0,0 +1,300 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + 
PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= 
buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +/* justifying the length of the output */ +static void postprocess(vec out[4][GFBITS]) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[3][i] <<= (128 - SYS_T) * 2; + out[3][i] >>= (128 - SYS_T) * 2; + } +} + +void PQCLEAN_MCELIECE6960119F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/crypto_kem/mceliece6960119f/vec/fft_tr.h b/crypto_kem/mceliece6960119f/vec/fft_tr.h new file mode 100644 index 00000000..5981e8a0 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE6960119F_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE6960119F_VEC_fft_tr(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/gf.c b/crypto_kem/mceliece6960119f/vec/gf.c new file mode 100644 index 00000000..a72c47fe --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119F_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119F_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + 
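The two masked-shift reduction steps in PQCLEAN_MCELIECE6960119F_VEC_gf_mul above correspond to reduction modulo the pentanomial x^13 + x^4 + x^3 + x + 1 (each bit at position 13+k is folded back into positions k+4, k+3, k+1, k). Below is a slow bit-at-a-time reference multiplication against which the shifted-mask version can be cross-checked; the gf_mul_ref name and the explicit constant DEMO_POLY = 0x201B are spelled out here for illustration only.

#include <stdint.h>
#include <stdio.h>

#define DEMO_GFBITS 13
#define DEMO_GFMASK ((1 << DEMO_GFBITS) - 1)
#define DEMO_POLY   0x201B   /* x^13 + x^4 + x^3 + x + 1 */

/* Schoolbook multiplication in GF(2^13): carry-less multiply,
   then reduce one bit at a time from the top. */
static uint16_t gf_mul_ref(uint16_t a, uint16_t b) {
    uint32_t acc = 0;

    for (int i = 0; i < DEMO_GFBITS; i++) {
        acc ^= (uint32_t)a * (b & (1u << i));   /* carry-less partial product */
    }
    for (int i = 2 * (DEMO_GFBITS - 1); i >= DEMO_GFBITS; i--) {
        if (acc & (1u << i)) {
            acc ^= (uint32_t)DEMO_POLY << (i - DEMO_GFBITS);
        }
    }
    return (uint16_t)(acc & DEMO_GFMASK);
}

int main(void) {
    /* x * x^12 = x^13 = x^4 + x^3 + x + 1, so this prints 0x001B */
    printf("0x%04X\n", gf_mul_ref(0x0002, 0x1000));
    return 0;
}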
+ x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119F_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119F_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119F_VEC_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119F_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119F_VEC_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119F_VEC_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece6960119f/vec/gf.h b/crypto_kem/mceliece6960119f/vec/gf.h new file mode 100644 index 00000000..6c7eb43f --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/gf.h @@ -0,0 +1,20 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_GF_H +#define PQCLEAN_MCELIECE6960119F_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119F_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6960119F_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE6960119F_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6960119F_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6960119F_VEC_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/operations.c b/crypto_kem/mceliece6960119f/vec/operations.c new file mode 100644 index 00000000..9abcda53 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include 
+#include + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119F_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119F_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119F_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119F_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119F_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119F_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119F_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119F_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119F_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119F_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/vec/params.h b/crypto_kem/mceliece6960119f/vec/params.h new file mode 100644 index 00000000..490810d3 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_PARAMS_H +#define PQCLEAN_MCELIECE6960119F_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/pk_gen.c b/crypto_kem/mceliece6960119f/vec/pk_gen.c new file mode 100644 index 00000000..c668685b --- /dev/null 
+++ b/crypto_kem/mceliece6960119f/vec/pk_gen.c @@ -0,0 +1,311 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ (SYS_N + 63) / 64 ], uint32_t *perm) { + int i, j, k, s, block_idx, row, tail; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + tail = row % 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) | + (mat[ row + i ][ block_idx + 1 ] << (64 - tail)); + } + + // compute the column indices of pivots by Gaussian elimination. 
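mov_columns() relies on the branchless ctz() defined above: it walks the 64 bit positions once and stops counting, via a mask rather than a break, at the first set bit. A standalone sketch of that helper (the demo_ name is illustrative; like the original, it assumes a non-zero input):

#include <stdint.h>
#include <stdio.h>

/* Branchless count-trailing-zeros, as in ctz() above. */
static int demo_ctz(uint64_t in) {
    int m = 0, r = 0;
    for (int i = 0; i < 64; i++) {
        int b = (int)(in >> i) & 1;
        m |= b;                    /* becomes 1 at the first set bit */
        r += (m ^ 1) & (b ^ 1);    /* count only while m is still 0  */
    }
    return r;
}

int main(void) {
    printf("%d %d %d\n", demo_ctz(1), demo_ctz(0x20), demo_ctz(1ULL << 63)); /* 0 5 63 */
    return 0;
}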
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) | + (mat[ i + j ][ block_idx + 1 ] << (64 - tail)); + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail)); + } + } + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { +#define NBLOCKS_H ((SYS_N + 63) / 64) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + const int block_idx = NBLOCKS_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS_H ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE6960119F_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE6960119F_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119F_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination + + 
for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = block_idx; k < NBLOCKS_H - 1; k++) { + mat[row][k] = (mat[row][k] >> tail) | (mat[row][k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119F_VEC_store8(pk, mat[row][k]); + pk += 8; + } + + mat[row][k] >>= tail; + PQCLEAN_MCELIECE6960119F_VEC_store_i(pk, mat[row][k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/vec/pk_gen.h b/crypto_kem/mceliece6960119f/vec/pk_gen.h new file mode 100644 index 00000000..8db4ec52 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE6960119F_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119F_VEC_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/scalars_2x.inc b/crypto_kem/mceliece6960119f/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 
0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece6960119f/vec/scalars_4x.inc b/crypto_kem/mceliece6960119f/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 
0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece6960119f/vec/sk_gen.c b/crypto_kem/mceliece6960119f/vec/sk_gen.c new file mode 100644 index 00000000..ed56ab8f --- --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119F_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119F_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119F_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119F_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119F_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119F_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119F_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) 
{ + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119F_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece6960119f/vec/sk_gen.h b/crypto_kem/mceliece6960119f/vec/sk_gen.h new file mode 100644 index 00000000..76bc2bfd --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE6960119F_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6960119F_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119F_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/transpose.c b/crypto_kem/mceliece6960119f/vec/transpose.c new file mode 100644 index 00000000..3aaa9e53 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece6960119f/vec/transpose.h b/crypto_kem/mceliece6960119f/vec/transpose.h new file mode 100644 index 00000000..4c95634e --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE6960119F_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE6960119F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/util.c b/crypto_kem/mceliece6960119f/vec/util.c new file mode 100644 index 00000000..56b16065 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/util.c @@ -0,0 +1,97 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ +#include "util.h" + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6960119F_VEC_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119F_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119F_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119F_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = 
PQCLEAN_MCELIECE6960119F_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[0][i] = v[0]; + out[1][i] = v[1]; + } +} + +void PQCLEAN_MCELIECE6960119F_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119F_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece6960119f/vec/util.h b/crypto_kem/mceliece6960119f/vec/util.h new file mode 100644 index 00000000..474f9f45 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/util.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_UTIL_H +#define PQCLEAN_MCELIECE6960119F_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE6960119F_VEC_store_i(unsigned char *out, uint64_t in, int i); + +void PQCLEAN_MCELIECE6960119F_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE6960119F_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE6960119F_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE6960119F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE6960119F_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE6960119F_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece6960119f/vec/vec.c b/crypto_kem/mceliece6960119f/vec/vec.c new file mode 100644 index 00000000..a7af3241 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/vec.c @@ -0,0 +1,138 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE6960119F_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE6960119F_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return (int)(a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE6960119F_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE6960119F_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + 
result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE6960119F_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE6960119F_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE6960119F_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE6960119F_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece6960119f/vec/vec.h b/crypto_kem/mceliece6960119f/vec/vec.h new file mode 100644 index 00000000..bf0f93d0 --- /dev/null +++ b/crypto_kem/mceliece6960119f/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE6960119F_VEC_VEC_H +#define PQCLEAN_MCELIECE6960119F_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE6960119F_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE6960119F_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE6960119F_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE6960119F_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE6960119F_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE6960119F_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/META.yml b/crypto_kem/mceliece8192128/META.yml new file mode 100644 index 00000000..b809f98f --- /dev/null +++ b/crypto_kem/mceliece8192128/META.yml @@ -0,0 +1,48 @@ +name: Classic McEliece 8192128 +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1357824 +length-secret-key: 14080 +length-ciphertext: 240 +length-shared-secret: 32 +nistkat-sha256: be85dab645c70e3a5eb91edcef125b2ae3838a8742e1fccf199149c4b814e357 +principal-submitters: + - Daniel J. 
Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt diff --git a/crypto_kem/mceliece8192128/avx/LICENSE b/crypto_kem/mceliece8192128/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece8192128/avx/Makefile b/crypto_kem/mceliece8192128/avx/Makefile new file mode 100644 index 00000000..9c3088ad --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece8192128_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc powers.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece8192128/avx/aes256ctr.c b/crypto_kem/mceliece8192128/avx/aes256ctr.c new file mode 100644 index 00000000..f26e16bd --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE8192128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; 
+ aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece8192128/avx/aes256ctr.h b/crypto_kem/mceliece8192128/avx/aes256ctr.h new file mode 100644 index 00000000..c7c1c4be --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE8192128_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128/avx/api.h b/crypto_kem/mceliece8192128/avx/api.h new file mode 100644 index 00000000..1e5cc1c1 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_API_H +#define PQCLEAN_MCELIECE8192128_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_ALGNAME "Classic McEliece 8192128" +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/benes.c b/crypto_kem/mceliece8192128/avx/benes.c new file mode 100644 index 00000000..11018b42 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = 
PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = 
PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128_AVX_load8(ptr), PQCLEAN_MCELIECE8192128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128_AVX_load8(ptr), PQCLEAN_MCELIECE8192128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control 
bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece8192128/avx/benes.h b/crypto_kem/mceliece8192128/avx/benes.h new file mode 100644 index 00000000..0409a46c --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_BENES_H +#define PQCLEAN_MCELIECE8192128_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/bm.c b/crypto_kem/mceliece8192128/avx/bm.c new file mode 100644 index 00000000..3bbd29fb --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/bm.c @@ -0,0 +1,214 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE8192128_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = 
PQCLEAN_MCELIECE8192128_AVX_vec256_or(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE8192128_AVX_vec256_or(PQCLEAN_MCELIECE8192128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = 
PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0, one << 63); + BC[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + PQCLEAN_MCELIECE8192128_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE8192128_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE8192128_AVX_update_asm(BC, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(out, prod, BC[0] + 1, 32); +} + diff --git a/crypto_kem/mceliece8192128/avx/bm.h b/crypto_kem/mceliece8192128/avx/bm.h new file mode 100644 index 00000000..f1ca018b --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_BM_H +#define PQCLEAN_MCELIECE8192128_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/consts.S b/crypto_kem/mceliece8192128/avx/consts.S new file mode 100644 index 00000000..e34172ea --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE8192128_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE8192128_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128_AVX_MASK2_1: .quad 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece8192128/avx/consts.inc b/crypto_kem/mceliece8192128/avx/consts.inc new file mode 100644 index 00000000..cc354957 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 
0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 
0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 
0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece8192128/avx/controlbits.c b/crypto_kem/mceliece8192128/avx/controlbits.c new file mode 100644 index 00000000..ec290b84 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort 
x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE8192128_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/avx/controlbits.h b/crypto_kem/mceliece8192128/avx/controlbits.h new file mode 100644 index 00000000..43a8a9ea --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/crypto_hash.h b/crypto_kem/mceliece8192128/avx/crypto_hash.h new file mode 100644 index 00000000..23d1c160 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128/avx/decrypt.c b/crypto_kem/mceliece8192128/avx/decrypt.c new file mode 100644 index 00000000..1fc3dcc4 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE8192128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128_AVX_load16(s + i * 16); + } +} + 
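+/* Hamming-weight helper: the 32 vec256 words handed to weight() below cover
+   32 * 256 = 8192 = SYS_N bit positions, so summing the POPCNT of each of the
+   four 64-bit lanes per word gives the weight of the bitsliced error vector. */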
+static int weight(vec256 *v) { + int i, w = 0; + + for (i = 0; i < 32; i++) { + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 0) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 1) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 2) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 3) ); + } + + return w; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec256_or(diff, PQCLEAN_MCELIECE8192128_AVX_vec256_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE8192128_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 64 ][ GFBITS ]; + vec256 scaled[ 64 ][ GFBITS ]; + vec256 eval[ 64 ][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE8192128_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE8192128_AVX_benes(recv128, bits_int, 1); + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); // scaling + PQCLEAN_MCELIECE8192128_AVX_fft_tr(s_priv, scaled); // transposed FFT + PQCLEAN_MCELIECE8192128_AVX_bm(locator, s_priv); // Berlekamp Massey + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, locator); // FFT + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(error256[i], allone); + } + + check_weight = (uint16_t)(weight(error256) ^ SYS_T); + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE8192128_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + 
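+    /* push the recovered error vector back through the Benes network
+       (direction flag 0, the opposite of the flag-1 call above) before
+       serializing it into e */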
PQCLEAN_MCELIECE8192128_AVX_benes(error128, bits_int, 0); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128_AVX_store16(e + i * 16, error128[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece8192128/avx/decrypt.h b/crypto_kem/mceliece8192128/avx/decrypt.h new file mode 100644 index 00000000..ad28767e --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE8192128_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/encrypt.c b/crypto_kem/mceliece8192128/avx/encrypt.c new file mode 100644 index 00000000..d33b1eb5 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/encrypt.c @@ -0,0 +1,80 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE8192128_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq; + + uint16_t ind[ SYS_T ]; + int32_t ind32[ SYS_T ]; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T; i++) { + ind32[i] = ind[i] &= GFMASK; + } + + // check for repetition + + PQCLEAN_MCELIECE8192128_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128_AVX_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128/avx/encrypt.h b/crypto_kem/mceliece8192128/avx/encrypt.h new file mode 100644 index 00000000..3cbf3451 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/fft.c b/crypto_kem/mceliece8192128/avx/fft.c new file mode 100644 index 00000000..e39e9ad2 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void 
radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(tmp[j], 
tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + 
buf.V[46] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + 
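+    /* two stages, matching the static helpers above: the radix conversions
+       twist the input polynomial, then the FFT butterflies evaluate it at
+       all field elements */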
radix_conversions(in);
+ butterflies(out, in);
+}
+
diff --git a/crypto_kem/mceliece8192128/avx/fft.h b/crypto_kem/mceliece8192128/avx/fft.h
new file mode 100644
index 00000000..808773f6
--- /dev/null
+++ b/crypto_kem/mceliece8192128/avx/fft.h
@@ -0,0 +1,17 @@
+#ifndef PQCLEAN_MCELIECE8192128_AVX_FFT_H
+#define PQCLEAN_MCELIECE8192128_AVX_FFT_H
+/*
+ This file is for the Gao-Mateer FFT
+ see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
+*/
+
+
+#include "params.h"
+#include "vec128.h"
+#include "vec256.h"
+#include <stdint.h>
+
+void PQCLEAN_MCELIECE8192128_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece8192128/avx/fft_tr.c b/crypto_kem/mceliece8192128/avx/fft_tr.c
new file mode 100644
index 00000000..7f8e71b9
--- /dev/null
+++ b/crypto_kem/mceliece8192128/avx/fft_tr.c
@@ -0,0 +1,379 @@
+/*
+ This file is for transpose of the Gao-Mateer FFT
+ Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c
+*/
+
+#include "fft_tr.h"
+
+#include "transpose.h"
+
+#include <stdint.h>
+
+static void radix_conversions_tr(vec256 *in) {
+ int i, j, k;
+ vec256 t;
+ uint64_t v[4];
+
+ const vec256 mask[6][2] = {
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444)
+ },
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030)
+ },
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00)
+ },
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000)
+ },
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000)
+ },
+ {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
+ PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF)
+ }
+ };
+
+ const vec256 s[6][GFBITS] = {
+#include "scalars_4x.inc"
+ };
+
+ //
+
+ for (j = 6; j >= 0; j--) {
+ if (j < 6) {
+ PQCLEAN_MCELIECE8192128_AVX_vec256_mul(in, in, s[j]); // scaling
+ }
+
+ for (k = j; k <= 4; k++) {
+ for (i = 0; i < GFBITS; i++) {
+ t = PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[i], mask[k][0]);
+ t = PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(t, 1 << k);
+ in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(in[i], t);
+
+ t = PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[i], mask[k][1]);
+ t = PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(t, 1 << k);
+ in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(in[i], t);
+ }
+ }
+
+ if (j <= 5) {
+ for (i = 0; i < GFBITS; i++) {
+ v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0);
+ v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1);
+ v[2] =
PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i 
/ 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], 
buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], 
buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = 
PQCLEAN_MCELIECE8192128_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128/avx/fft_tr.h b/crypto_kem/mceliece8192128/avx/fft_tr.h new file mode 100644 index 00000000..76f999aa --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE8192128_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/gf.c b/crypto_kem/mceliece8192128/avx/gf.c new file mode 100644 index 00000000..b69a7c1a --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= 
(t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/avx/gf.h b/crypto_kem/mceliece8192128/avx/gf.h new file mode 100644 index 00000000..a3ccb254 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_GF_H +#define PQCLEAN_MCELIECE8192128_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/int32_sort.c b/crypto_kem/mceliece8192128/avx/int32_sort.c new file mode 100644 index 00000000..448e778b --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define 
int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); 
+ int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + 
mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); 
+ int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + 
for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = 
_mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + 
int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + 
int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE8192128_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = 
_mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 
= _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece8192128/avx/int32_sort.h b/crypto_kem/mceliece8192128/avx/int32_sort.h new file mode 100644 index 00000000..cd5240b4 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE8192128_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE8192128_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece8192128/avx/operations.c b/crypto_kem/mceliece8192128/avx/operations.c new file mode 100644 index 00000000..4cd5d334 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include 
"randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/avx/params.h b/crypto_kem/mceliece8192128/avx/params.h new file mode 100644 index 00000000..5e2934b9 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_PARAMS_H +#define PQCLEAN_MCELIECE8192128_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/pk_gen.c b/crypto_kem/mceliece8192128/avx/pk_gen.c new file mode 
100644 index 00000000..cc6c83b0 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/pk_gen.c @@ -0,0 +1,288 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE8192128_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c, d; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + uint64_t ops[ GFBITS * SYS_T ][ GFBITS * SYS_T / 64 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ (SYS_N - GFBITS * SYS_T) / 64 ]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { 
+ mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // Gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ row ][ c ] = 0; + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + ops[ row ][ i ] = 1; + ops[ row ][ i ] <<= j; + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + + // computing the linear map required to obtain the systematic form + + for (i = (GFBITS * SYS_T) / 64 - 1; i >= 0; i--) { + for (j = 63; j >= 0; j--) { + row = i * 64 + j; + + for (k = 0; k < row; k++) { + { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = 
PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + for (d = 0; d < 64; d++) { + mask = ops[ row ][ c ] >> d; + mask &= 1; + mask = -mask; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] ^= mat[ c * 64 + d ][ k + (GFBITS * SYS_T) / 64 ] & mask; + } + } + } + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + PQCLEAN_MCELIECE8192128_AVX_store8(pk, one_row[ k ]); + pk += 8; + } + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/avx/pk_gen.h b/crypto_kem/mceliece8192128/avx/pk_gen.h new file mode 100644 index 00000000..2e30edc6 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE8192128_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/powers.inc b/crypto_kem/mceliece8192128/avx/powers.inc new file mode 100644 index 00000000..0219933d --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 
0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 
0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 
0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 
0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 
0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 
0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff 
--git a/crypto_kem/mceliece8192128/avx/scalars_2x.inc b/crypto_kem/mceliece8192128/avx/scalars_2x.inc new file mode 100644 index 00000000..7ea5a308 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + 
PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece8192128/avx/scalars_4x.inc b/crypto_kem/mceliece8192128/avx/scalars_4x.inc new file mode 100644 index 00000000..57b78117 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF000000, 
0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece8192128/avx/sk_gen.c b/crypto_kem/mceliece8192128/avx/sk_gen.c new file mode 100644 index 00000000..7901b18a --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128_AVX_sort_63b(1 << GFBITS, list); +
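+/* after sorting, any repeated entry is adjacent to its duplicate, so a single pass comparing neighbouring values is enough to detect a repeated index */ + for (i = 1; i < (1 << 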
GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/avx/sk_gen.h b/crypto_kem/mceliece8192128/avx/sk_gen.h new file mode 100644 index 00000000..d0d4c53a --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE8192128_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/syndrome_asm.S b/crypto_kem/mceliece8192128/avx/syndrome_asm.S new file mode 100644 index 00000000..5afc055a --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/syndrome_asm.S @@ -0,0 +1,910 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_syndrome_asm +.global PQCLEAN_MCELIECE8192128_AVX_syndrome_asm +_PQCLEAN_MCELIECE8192128_AVX_syndrome_asm: +PQCLEAN_MCELIECE8192128_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp 
&= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 816 ] +# asm 1: vmovupd 816(ee=reg256#3 +# asm 2: vmovupd 816(ee=%ymm2 +vmovupd 816(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 848 ] +# asm 1: vmovupd 848(ee=reg256#3 +# asm 2: vmovupd 848(ee=%ymm2 +vmovupd 848(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 
1: vpand pp=reg256#2 +# asm 2: vmovupd 672(pp=%ymm1 +vmovupd 672(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 880 ] +# asm 1: vmovupd 880(ee=reg256#3 +# asm 2: vmovupd 880(ee=%ymm2 +vmovupd 880(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 704(pp=%ymm1 +vmovupd 704(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 912 ] +# asm 1: vmovupd 912(ee=reg256#3 +# asm 2: vmovupd 912(ee=%ymm2 +vmovupd 912(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 736(pp=%ymm1 +vmovupd 736(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 944 ] +# asm 1: vmovupd 944(ee=reg256#3 +# asm 2: vmovupd 944(ee=%ymm2 +vmovupd 944(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 768(pp=%ymm1 +vmovupd 768(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 976 ] +# asm 1: vmovupd 976(ee=reg256#3 +# asm 2: vmovupd 976(ee=%ymm2 +vmovupd 976(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 800] +# asm 1: movq 800(s=int64#6 +# asm 2: movq 800(s=%r9 +movq 800(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 1008] +# asm 1: movq 1008(e=int64#7 +# asm 2: movq 1008(e=%rax +movq 1008(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 808(p=%rax +movq 808(%rsi),%rax + +# qhasm: e = mem64[input_2 + 1016] +# asm 1: movq 1016(e=int64#8 +# asm 2: movq 1016(e=%r10 +movq 1016(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 
128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq 
$32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# 
asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld 
$16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 
+movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 
unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# 
asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq 
$32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# 
qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# 
qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq 
$32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# 
asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld 
$16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 
+movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 
+# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 
+vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] 
+# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 
2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & 
mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + 
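The unrolled qhasm/assembly above repeats one pattern: a masked shift-and-OR "delta swap" that exchanges bit groups between two rows of a bit matrix, first in 4-bit groups (mask0/mask1, shift 4), then 2-bit groups (mask2/mask3, shift 2), then single bits (mask4/mask5, shift 1). As an illustration only, and not code taken from this diff, the per-stage operation on a single 64-bit lane looks roughly like the C sketch below; the helper name transpose_stage and the example mask constants are assumptions for illustration, and the generated code performs the same step on whole 128-bit (and, later in the diff, 256-bit) vector registers at once.

#include <stdint.h>

/* Illustrative sketch, not part of the diff: one butterfly stage of the
 * bit-matrix transpose that the generated assembly unrolls.  mask_keep
 * selects the bit groups that stay in their row, mask_move the groups
 * exchanged with the partner row s positions away. */
static void transpose_stage(uint64_t *lo, uint64_t *hi, unsigned s,
                            uint64_t mask_keep, uint64_t mask_move) {
    uint64_t v00 = *lo & mask_keep;         /* lo bits that stay put       */
    uint64_t v10 = (*hi & mask_keep) << s;  /* hi bits pulled up into lo   */
    uint64_t v01 = (*lo & mask_move) >> s;  /* lo bits pushed down into hi */
    uint64_t v11 = *hi & mask_move;         /* hi bits that stay put       */
    *lo = v00 | v10;
    *hi = v01 | v11;
}

/* For example (assumed, standard constants): s = 4 with masks
 * 0x0f0f0f0f0f0f0f0f / 0xf0f0f0f0f0f0f0f0 mirrors the mask0/mask1 stage,
 * s = 2 with 0x3333... / 0xcccc... the mask2/mask3 stage, and s = 1 with
 * 0x5555... / 0xaaaa... the mask4/mask5 stage. */
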
+# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# 
qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor 
%xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 
+vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 
+# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# 
asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# 
qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor 
%xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# 
asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: 
vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 
& mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 
<<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 
+vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand 
%xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# 
asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq 
$32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld 
$16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand 
%ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 
+vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld 
$16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: 
vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 
+vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 
+vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand 
%ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld 
$16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw 
$8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand 
%ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 
+vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 
+vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 
+vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand 
%ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 
480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor 
%ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: 
vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] 
+# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + 
+# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 
+# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 
+ +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + 
+# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq 
$4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 
unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# 
asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# 
asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 
+# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq 
$4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: 
vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 
1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & 
mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# 
qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 
+# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# 
qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# 
asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 
+# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq 
$1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE8192128_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece8192128/avx/update_asm.S b/crypto_kem/mceliece8192128/avx/update_asm.S new file mode 100644 index 00000000..2c930045 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_update_asm +.global PQCLEAN_MCELIECE8192128_AVX_update_asm +_PQCLEAN_MCELIECE8192128_AVX_update_asm: +PQCLEAN_MCELIECE8192128_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: 
movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE8192128_AVX_store_i(unsigned char *out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 
|= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128_AVX_vec128_set2x( PQCLEAN_MCELIECE8192128_AVX_load8(in), PQCLEAN_MCELIECE8192128_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128_AVX_store8(out + 0, PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128_AVX_store8(out + 8, PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece8192128/avx/util.h b/crypto_kem/mceliece8192128/avx/util.h new file mode 100644 index 00000000..17b7538f --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_UTIL_H +#define PQCLEAN_MCELIECE8192128_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece8192128/avx/vec128.c b/crypto_kem/mceliece8192128/avx/vec128.c new file mode 100644 index 00000000..7403dde1 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 
PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece8192128/avx/vec128.h b/crypto_kem/mceliece8192128/avx/vec128.h new file mode 100644 index 00000000..c0cc80f7 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_VEC128_H +#define PQCLEAN_MCELIECE8192128_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE8192128_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece8192128/avx/vec128_mul_asm.S b/crypto_kem/mceliece8192128/avx/vec128_mul_asm.S new file mode 100644 index 00000000..d886c3da --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: 
reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: 
input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor 
r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand 
%ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 
528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 
544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: 
vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE8192128_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE8192128_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE8192128_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced 
field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE8192128_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE8192128_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece8192128/avx/vec256_ama_asm.S b/crypto_kem/mceliece8192128/avx/vec256_ama_asm.S new file mode 100644 index 00000000..3a6f7587 --- /dev/null +++ b/crypto_kem/mceliece8192128/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 
160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# 
asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 
64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# 
qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 
32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 
32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 
96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# 
asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + 
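+# Each limb a_i in this routine is handled by the same bitsliced schoolbook
+# step seen above and below: r = a_i & mem256[ input_2 + 32*j ], followed by
+# r_{i+j} ^= r, for each of the 13 words j = 0..12 of b.  In C-like form
+# (purely illustrative): r[i + j] ^= a[i] & b[j].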
+# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = 
mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 
352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 
96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r 
+# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 
+vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 
320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: 
vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi 
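#
# Editor's note (hedged; the qhasm "# asm 1/2" operand annotations in this
# region were damaged during extraction and are not reconstructed here).
# The surviving instructions show the routine's shape: a result register is
# cleared with "mov $0,%rax", then for each pair of adjacent 64-bit words of
# input_0 -- offsets 192/200 down to 0/8 -- the two words are XORed, the
# population count is taken, and only the low bit is kept, i.e. the parity
# of the XOR, which is then folded into the result.  A C sketch of one step
# (the helper shape and the exact accumulation order are assumptions):
#
#   uint64_t t = in[2 * i] ^ in[2 * i + 1];              /* combine pair  */
#   uint64_t c = (uint64_t)__builtin_popcountll(t) & 1;  /* parity bit    */
#   r = (r << 1) | c;                                    /* fold into r   */
#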
+ +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: 
movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128/clean/api.h b/crypto_kem/mceliece8192128/clean/api.h new file mode 100644 index 00000000..ddebf4ef --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_API_H +#define PQCLEAN_MCELIECE8192128_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_ALGNAME "Classic McEliece 8192128" +#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/benes.c b/crypto_kem/mceliece8192128/clean/benes.c new file mode 100644 index 00000000..69c1a02e --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE8192128_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE8192128_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = 
PQCLEAN_MCELIECE8192128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE8192128_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE8192128_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE8192128_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE8192128_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece8192128/clean/benes.h b/crypto_kem/mceliece8192128/clean/benes.h new file mode 100644 index 00000000..722bd771 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_BENES_H +#define PQCLEAN_MCELIECE8192128_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE8192128_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/bm.c b/crypto_kem/mceliece8192128/clean/bm.c new file mode 100644 index 00000000..2210e567 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE8192128_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE8192128_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece8192128/clean/bm.h b/crypto_kem/mceliece8192128/clean/bm.h new file mode 100644 index 00000000..e5acd06e --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_BM_H +#define PQCLEAN_MCELIECE8192128_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/controlbits.c b/crypto_kem/mceliece8192128/clean/controlbits.c new file mode 100644 index 00000000..4c493986 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/clean/controlbits.h b/crypto_kem/mceliece8192128/clean/controlbits.h new file mode 100644 index 00000000..6a9e5676 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation
+  see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
+*/
+
+
+#include <stdint.h>
+
+void PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(int n, uint64_t *x);
+void PQCLEAN_MCELIECE8192128_CLEAN_controlbits(unsigned char *out, const uint32_t *pi);
+
+#endif
+
diff --git a/crypto_kem/mceliece8192128/clean/crypto_hash.h b/crypto_kem/mceliece8192128/clean/crypto_hash.h
new file mode 100644
index 00000000..7ebe5cc5
--- /dev/null
+++ b/crypto_kem/mceliece8192128/clean/crypto_hash.h
@@ -0,0 +1,7 @@
+#ifndef PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_HASH_H
+#define PQCLEAN_MCELIECE8192128_CLEAN_CRYPTO_HASH_H
+#include "fips202.h"
+
+#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen)
+
+#endif
diff --git a/crypto_kem/mceliece8192128/clean/decrypt.c b/crypto_kem/mceliece8192128/clean/decrypt.c
new file mode 100644
index 00000000..13513d1b
--- /dev/null
+++ b/crypto_kem/mceliece8192128/clean/decrypt.c
@@ -0,0 +1,90 @@
+/*
+  This file is for Niederreiter decryption
+*/
+
+#include "decrypt.h"
+
+#include "benes.h"
+#include "bm.h"
+#include "gf.h"
+#include "params.h"
+#include "root.h"
+#include "synd.h"
+#include "util.h"
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* input: sk, secret key */
+/*        c, ciphertext */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
+int PQCLEAN_MCELIECE8192128_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
+    int i, w = 0;
+    uint16_t check;
+
+    unsigned char r[ SYS_N / 8 ];
+
+    gf g[ SYS_T + 1 ];
+    gf L[ SYS_N ];
+
+    gf s[ SYS_T * 2 ];
+    gf s_cmp[ SYS_T * 2 ];
+    gf locator[ SYS_T + 1 ];
+    gf images[ SYS_N ];
+
+    gf t;
+
+    //
+
+    for (i = 0; i < SYND_BYTES; i++) {
+        r[i] = c[i];
+    }
+    for (i = SYND_BYTES; i < SYS_N / 8; i++) {
+        r[i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        g[i] = PQCLEAN_MCELIECE8192128_CLEAN_load2(sk);
+        g[i] &= GFMASK;
+        sk += 2;
+    }
+    g[ SYS_T ] = 1;
+
+    PQCLEAN_MCELIECE8192128_CLEAN_support_gen(L, sk);
+
+    PQCLEAN_MCELIECE8192128_CLEAN_synd(s, g, L, r);
+
+    PQCLEAN_MCELIECE8192128_CLEAN_bm(locator, s);
+
+    PQCLEAN_MCELIECE8192128_CLEAN_root(images, locator, L);
+
+    //
+
+    for (i = 0; i < SYS_N / 8; i++) {
+        e[i] = 0;
+    }
+
+    for (i = 0; i < SYS_N; i++) {
+        t = PQCLEAN_MCELIECE8192128_CLEAN_gf_iszero(images[i]) & 1;
+
+        e[ i / 8 ] |= t << (i % 8);
+        w += t;
+
+    }
+
+    PQCLEAN_MCELIECE8192128_CLEAN_synd(s_cmp, g, L, e);
+
+    //
+
+    check = (uint16_t)w;
+    check ^= SYS_T;
+
+    for (i = 0; i < SYS_T * 2; i++) {
+        check |= s[i] ^ s_cmp[i];
+    }
+
+    check -= 1;
+    check >>= 15;
+
+    return check ^ 1;
+}
+
diff --git a/crypto_kem/mceliece8192128/clean/decrypt.h b/crypto_kem/mceliece8192128/clean/decrypt.h
new file mode 100644
index 00000000..0b1f8fee
--- /dev/null
+++ b/crypto_kem/mceliece8192128/clean/decrypt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_MCELIECE8192128_CLEAN_DECRYPT_H
+#define PQCLEAN_MCELIECE8192128_CLEAN_DECRYPT_H
+/*
+  This file is for Niederreiter decryption
+*/
+
+int PQCLEAN_MCELIECE8192128_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece8192128/clean/encrypt.c b/crypto_kem/mceliece8192128/clean/encrypt.c
new file mode 100644
index 00000000..6be91350
--- /dev/null
+++ b/crypto_kem/mceliece8192128/clean/encrypt.c
@@ -0,0 +1,126 @@
+/*
+  This file is for Niederreiter encryption
+*/
+
+#include "encrypt.h"
+
+#include "params.h"
+#include "randombytes.h"
+#include "util.h"
+
+#include
+#include
+#include
+#include
+
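/* Editor's sketch (not part of the upstream patch; eq_mask32 is a
   hypothetical helper added only for illustration): the `check` computation
   at the end of decrypt.c above and same_mask() just below both use the same
   constant-time idiom -- XOR the values, subtract 1 so the top bit becomes
   set exactly when they were equal, shift that bit down, and (when a
   full-width mask is wanted) negate it. */
static inline uint32_t eq_mask32(uint32_t x, uint32_t y) {
    uint32_t d = x ^ y;       /* d == 0 exactly when x == y                */
    d -= 1;                   /* wraps to 0xFFFFFFFF exactly when d was 0  */
    d >>= 31;                 /* now 1 if x == y, 0 otherwise              */
    return (uint32_t)(0 - d); /* all-ones mask if x == y, all-zeros if not */
}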
+#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq; + + uint16_t ind[ SYS_T ]; + uint8_t *ind8 = (uint8_t *)ind; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + // Copy to uint16_t ind in a little-endian way + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE8192128_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128/clean/encrypt.h b/crypto_kem/mceliece8192128/clean/encrypt.h new file mode 100644 index 00000000..1c5404dc --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/gf.c b/crypto_kem/mceliece8192128/clean/gf.c new file mode 100644 index 00000000..9b4eb1cb --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/gf.c @@ -0,0 +1,210 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t 
B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE8192128_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(prod[i], (gf) 7682); + prod[i - SYS_T + 3] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(prod[i], (gf) 2159); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/clean/gf.h b/crypto_kem/mceliece8192128/clean/gf.h new file mode 100644 index 00000000..aecc97b9 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef 
PQCLEAN_MCELIECE8192128_CLEAN_GF_H +#define PQCLEAN_MCELIECE8192128_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE8192128_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE8192128_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE8192128_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/operations.c b/crypto_kem/mceliece8192128/clean/operations.c new file mode 100644 index 00000000..67d9acf5 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128_CLEAN_store2(sk + SYS_N / 8 + i * 
2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/clean/params.h b/crypto_kem/mceliece8192128/clean/params.h new file mode 100644 index 00000000..7143b373 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE8192128_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/pk_gen.c b/crypto_kem/mceliece8192128/clean/pk_gen.c new file mode 100644 index 00000000..bdfaa2f8 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/pk_gen.c @@ -0,0 +1,144 @@ +/* + This file is for public-key generation +*/ + +#include + +#include "benes.h" +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE8192128_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + uint8_t mask; + uint8_t b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE8192128_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE8192128_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE8192128_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE8192128_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // 
return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/clean/pk_gen.h b/crypto_kem/mceliece8192128/clean/pk_gen.h new file mode 100644 index 00000000..93ad8364 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE8192128_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE8192128_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/root.c b/crypto_kem/mceliece8192128/clean/root.c new file mode 100644 index 00000000..8c6c367c --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE8192128_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE8192128_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE8192128_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE8192128_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece8192128/clean/root.h b/crypto_kem/mceliece8192128/clean/root.h new file mode 100644 index 00000000..7b7c95d6 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE8192128_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE8192128_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE8192128_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/sk_gen.c b/crypto_kem/mceliece8192128/clean/sk_gen.c new file mode 100644 index 00000000..81cd0ed9 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( 
mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/clean/sk_gen.h b/crypto_kem/mceliece8192128/clean/sk_gen.h new file mode 100644 index 00000000..ed9d3757 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE8192128_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE8192128_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/synd.c b/crypto_kem/mceliece8192128/clean/synd.c new file mode 100644 index 00000000..fcfdeb79 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE8192128_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE8192128_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE8192128_CLEAN_gf_inv(PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE8192128_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE8192128_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece8192128/clean/synd.h b/crypto_kem/mceliece8192128/clean/synd.h new file mode 100644 index 00000000..6b51da54 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_SYND_H +#define PQCLEAN_MCELIECE8192128_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/transpose.c b/crypto_kem/mceliece8192128/clean/transpose.c new file mode 100644 index 00000000..81372765 --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include <stdint.h> + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void 
PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece8192128/clean/transpose.h b/crypto_kem/mceliece8192128/clean/transpose.h new file mode 100644 index 00000000..c7e220ad --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE8192128_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE8192128_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/clean/util.c b/crypto_kem/mceliece8192128/clean/util.c new file mode 100644 index 00000000..e6b87fcd --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE8192128_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE8192128_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE8192128_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece8192128/clean/util.h b/crypto_kem/mceliece8192128/clean/util.h new file mode 100644 index 00000000..5394ba7d --- /dev/null +++ b/crypto_kem/mceliece8192128/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE8192128_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE8192128_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE8192128_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE8192128_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE8192128_CLEAN_load4(const unsigned char * /*in*/); + +void 
PQCLEAN_MCELIECE8192128_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE8192128_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE8192128_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/LICENSE b/crypto_kem/mceliece8192128/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece8192128/sse/Makefile b/crypto_kem/mceliece8192128/sse/Makefile new file mode 100644 index 00000000..dde9c0f3 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece8192128_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc powers.inc + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece8192128/sse/aes256ctr.c b/crypto_kem/mceliece8192128/sse/aes256ctr.c new file mode 100644 index 00000000..ec13d89f --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE8192128_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece8192128/sse/aes256ctr.h b/crypto_kem/mceliece8192128/sse/aes256ctr.h new file mode 100644 index 00000000..24a122a8 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE8192128_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128/sse/api.h b/crypto_kem/mceliece8192128/sse/api.h new file mode 100644 index 
00000000..8b5112c6 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_API_H +#define PQCLEAN_MCELIECE8192128_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_ALGNAME "Classic McEliece 8192128" +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/benes.c b/crypto_kem/mceliece8192128/sse/benes.c new file mode 100644 index 00000000..97bae266 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = 
PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 
64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128_SSE_load8(ptr), PQCLEAN_MCELIECE8192128_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128_SSE_load8(ptr), PQCLEAN_MCELIECE8192128_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + 
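/* each layer_* call consumes one block of 32 control-bit vectors, which is why b_ptr advances by inc = 32 (rev == 0) or -32 (inverse); in total the Benes network applies 2*GFBITS - 1 = 25 conditional-swap stages, taken from b[0]..b[24] in forward order for the permutation and in reverse order when rev != 0 */ +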
layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece8192128/sse/benes.h b/crypto_kem/mceliece8192128/sse/benes.h new file mode 100644 index 00000000..19e9c880 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_BENES_H +#define PQCLEAN_MCELIECE8192128_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/bm.c b/crypto_kem/mceliece8192128/sse/bm.c new file mode 100644 index 00000000..701128db --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/bm.c @@ -0,0 +1,208 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE8192128_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE8192128_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE8192128_SSE_vec128_or(PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE8192128_SSE_vec128_or(PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(); + } + + mask[0][0] = 
PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(); + B[0] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(prod, C, (vec128 *) interval); + PQCLEAN_MCELIECE8192128_SSE_update_asm(interval, coefs[N]); + d = PQCLEAN_MCELIECE8192128_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128_SSE_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(C_tmp, 
bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE8192128_SSE_update_asm(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece8192128/sse/bm.h b/crypto_kem/mceliece8192128/sse/bm.h new file mode 100644 index 00000000..4ada489b --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_BM_H +#define PQCLEAN_MCELIECE8192128_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/consts.S b/crypto_kem/mceliece8192128/sse/consts.S new file mode 100644 index 00000000..e9c9b1b4 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE8192128_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE8192128_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE8192128_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece8192128/sse/consts.inc b/crypto_kem/mceliece8192128/sse/consts.inc new file mode 100644 index 00000000..3629e235 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FFFF0000FF, 
0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669966996699669, 
0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 
0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 
0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 
0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 
0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ 
+ PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece8192128/sse/controlbits.c b/crypto_kem/mceliece8192128/sse/controlbits.c new file mode 100644 index 00000000..4ef3749d --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/sse/controlbits.h b/crypto_kem/mceliece8192128/sse/controlbits.h new file mode 100644 index 00000000..8fe83fa4 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/crypto_hash.h b/crypto_kem/mceliece8192128/sse/crypto_hash.h new file mode 100644 index 00000000..88567966 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128/sse/decrypt.c b/crypto_kem/mceliece8192128/sse/decrypt.c new file mode 100644 index 00000000..9c6720d1 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/decrypt.c @@ -0,0 +1,175 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE8192128_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128_SSE_load16(s + i * 16); + } +} + +static uint16_t weight(vec128 *v) { + uint16_t i, w = 0; + + for (i = 0; i < 64; i++) { + w += (uint16_t)_mm_popcnt_u64(PQCLEAN_MCELIECE8192128_SSE_vec128_extract(v[i], 0) ); + w += (uint16_t)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_SSE_vec128_extract(v[i], 1) ); + } + + return w; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_or(PQCLEAN_MCELIECE8192128_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE8192128_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE8192128_SSE_vec128_or(diff, PQCLEAN_MCELIECE8192128_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE8192128_SSE_vec128_testz(diff); +} 
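The helper just above reduces the bitsliced syndrome comparison to a single branch-free zero test. Below is a minimal standalone sketch of the same constant-time equality idiom on plain 64-bit words; it is illustrative only (not part of this patch), and the function name eq_ct is hypothetical.

/* Illustrative sketch, not part of the patch: constant-time equality of two
   uint64_t arrays, mirroring the XOR/OR-accumulate-then-test pattern used by
   synd_cmp above. Returns 1 iff the arrays are equal, with no data-dependent
   branches. */
#include <stddef.h>
#include <stdint.h>

static uint16_t eq_ct(const uint64_t *a, const uint64_t *b, size_t n) {
    uint64_t diff = 0;
    size_t i;

    for (i = 0; i < n; i++) {
        diff |= a[i] ^ b[i];   /* collects every differing bit */
    }

    /* fold all bits of diff into bit 0 */
    diff |= diff >> 32;
    diff |= diff >> 16;
    diff |= diff >> 8;
    diff |= diff >> 4;
    diff |= diff >> 2;
    diff |= diff >> 1;

    return (uint16_t)((diff & 1) ^ 1);   /* 1 if equal, 0 otherwise */
}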
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* input: sk, secret key */
+/*        c, ciphertext (syndrome) */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
+int PQCLEAN_MCELIECE8192128_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
+    int i;
+
+    uint16_t check_synd;
+    uint16_t check_weight;
+
+    vec128 inv[ 64 ][ GFBITS ];
+    vec128 scaled[ 64 ][ GFBITS ];
+    vec128 eval[ 64 ][ GFBITS ];
+
+    vec128 error[ 64 ];
+
+    vec128 s_priv[ 2 ][ GFBITS ];
+    vec128 s_priv_cmp[ 2 ][ GFBITS ];
+    vec128 locator[ GFBITS ];
+
+    vec128 recv[ 64 ];
+    vec128 allone;
+
+    vec128 bits_int[25][32];
+
+    // Berlekamp decoder
+
+    preprocess(recv, c);
+
+    PQCLEAN_MCELIECE8192128_SSE_load_bits(bits_int, sk + IRR_BYTES);
+    PQCLEAN_MCELIECE8192128_SSE_benes(recv, bits_int, 1);
+
+    scaling(scaled, inv, sk, recv);
+    PQCLEAN_MCELIECE8192128_SSE_fft_tr(s_priv, scaled);
+    PQCLEAN_MCELIECE8192128_SSE_bm(locator, s_priv);
+
+    PQCLEAN_MCELIECE8192128_SSE_fft(eval, locator);
+
+    // reencryption and weight check
+
+    allone = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits(1);
+
+    for (i = 0; i < 64; i++) {
+        error[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_or_reduce(eval[i]);
+        error[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(error[i], allone);
+    }
+
+    check_weight = weight(error) ^ SYS_T;
+    check_weight -= 1;
+    check_weight >>= 15;
+
+    scaling_inv(scaled, inv, error);
+    PQCLEAN_MCELIECE8192128_SSE_fft_tr(s_priv_cmp, scaled);
+
+    check_synd = synd_cmp(s_priv, s_priv_cmp);
+
+    //
+
+    PQCLEAN_MCELIECE8192128_SSE_benes(error, bits_int, 0);
+
+    for (i = 0; i < 64; i++) {
+        PQCLEAN_MCELIECE8192128_SSE_store16(e + i * 16, error[i]);
+    }
+
+    return 1 - (check_synd & check_weight);
+}
+
diff --git a/crypto_kem/mceliece8192128/sse/decrypt.h b/crypto_kem/mceliece8192128/sse/decrypt.h
new file mode 100644
index 00000000..1728ffa0
--- /dev/null
+++ b/crypto_kem/mceliece8192128/sse/decrypt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_MCELIECE8192128_SSE_DECRYPT_H
+#define PQCLEAN_MCELIECE8192128_SSE_DECRYPT_H
+/*
+  This file is for Niederreiter decryption
+*/
+
+int PQCLEAN_MCELIECE8192128_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece8192128/sse/encrypt.c b/crypto_kem/mceliece8192128/sse/encrypt.c
new file mode 100644
index 00000000..806be006
--- /dev/null
+++ b/crypto_kem/mceliece8192128/sse/encrypt.c
@@ -0,0 +1,84 @@
+/*
+  This file is for Niederreiter encryption
+*/
+
+#include "encrypt.h"
+
+#include "params.h"
+#include "randombytes.h"
+#include "util.h"
+
+#include <stdint.h>
+
+/* input: public key pk, error vector e */
+/* output: syndrome s */
+extern void PQCLEAN_MCELIECE8192128_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e);
+
+/* output: e, an error vector of weight t */
+static void gen_e(unsigned char *e) {
+    size_t i, j;
+    int eq;
+
+    uint16_t ind[ SYS_T ];
+    uint64_t e_int[ SYS_N / 64 ];
+    uint64_t one = 1;
+    uint64_t mask;
+    uint64_t val[ SYS_T ];
+
+    while (1) {
+        randombytes((uint8_t *)ind, sizeof(ind));
+
+        for (i = 0; i < SYS_T; i++) {
+            ind[i] &= GFMASK;
+        }
+
+        for (i = 0; i < SYS_T; i++) {
+            ind[i] &= GFMASK;
+        }
+
+        // check for repetition
+
+        eq = 0;
+
+        for (i = 1; i < SYS_T; i++) {
+            for (j = 0; j < i; j++) {
+                if (ind[i] == ind[j]) {
+                    eq = 1;
+                }
+            }
+        }
+
+        if (eq == 0) {
+            break;
+        }
+    }
+
+    for (j = 0; j < SYS_T; j++) {
+        val[j] = one << (ind[j] & 63);
+    }
+
+    for (i = 0; i < SYS_N / 64; i++) {
+        e_int[i] = 0;
+
+        for (j = 0; j < SYS_T; j++) {
+ mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128_SSE_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128/sse/encrypt.h b/crypto_kem/mceliece8192128/sse/encrypt.h new file mode 100644 index 00000000..a9b7e6b0 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/fft.c b/crypto_kem/mceliece8192128/sse/fft.c new file mode 100644 index 00000000..e4ce86d1 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/fft.c @@ -0,0 +1,243 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 
out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec128 powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE8192128_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], pre[2][i]); + buf[52] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = 
PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[33], pre[0][i]); + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void 
PQCLEAN_MCELIECE8192128_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece8192128/sse/fft.h b/crypto_kem/mceliece8192128/sse/fft.h new file mode 100644 index 00000000..f0d18300 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_FFT_H +#define PQCLEAN_MCELIECE8192128_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE8192128_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/fft_tr.c b/crypto_kem/mceliece8192128/sse/fft_tr.c new file mode 100644 index 00000000..4f406ada --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/fft_tr.c @@ -0,0 +1,338 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE8192128_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = 
PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(x1, PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(x1, PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, tmp0, consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE8192128_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = 
PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[49], 
buf[51]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[12], buf[13]); + 
pre[3][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +void PQCLEAN_MCELIECE8192128_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128/sse/fft_tr.h b/crypto_kem/mceliece8192128/sse/fft_tr.h new file mode 100644 index 00000000..3006975d --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE8192128_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/gf.c b/crypto_kem/mceliece8192128/sse/gf.c new file mode 100644 index 00000000..554e8316 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t 
>> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128_SSE_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128_SSE_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128_SSE_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128_SSE_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/sse/gf.h 
b/crypto_kem/mceliece8192128/sse/gf.h new file mode 100644 index 00000000..f4d50f07 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_GF_H +#define PQCLEAN_MCELIECE8192128_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_SSE_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/operations.c b/crypto_kem/mceliece8192128/sse/operations.c new file mode 100644 index 00000000..567e67fb --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + 
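+        // key generation is a rejection-sampling loop: if perm_check rejects
+        // the candidate permutation (or any later step fails), we `continue`
+        // and retry with fresh randomness derived from the updated seed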
if (PQCLEAN_MCELIECE8192128_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/sse/params.h b/crypto_kem/mceliece8192128/sse/params.h new file mode 100644 index 00000000..f316e2b7 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_PARAMS_H +#define PQCLEAN_MCELIECE8192128_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/pk_gen.c b/crypto_kem/mceliece8192128/sse/pk_gen.c new file mode 100644 index 00000000..e1a63605 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/pk_gen.c @@ -0,0 +1,268 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +int PQCLEAN_MCELIECE8192128_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c, d; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + uint64_t ops[ GFBITS * SYS_T ][ GFBITS * SYS_T / 64 ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ (SYS_N - GFBITS * SYS_T) / 64 ]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE8192128_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + 
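+        // prod[i] = eval[0] * ... * eval[i] (bitsliced running product);
+        // a single inversion of prod[63] below is then back-multiplied to
+        // recover all 64 inverses (Montgomery's batch-inversion trick)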
PQCLEAN_MCELIECE8192128_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < (GFBITS * SYS_T + 127) / 128; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < (GFBITS * SYS_T + 127) / 128; j++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ row ][ c ] = 0; + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + ops[ row ][ i ] = 1; + ops[ row ][ i ] <<= j; + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (i = (GFBITS * SYS_T) / 64 - 1; i >= 0; i--) { + for (j = 63; j >= 0; j--) { + row = i * 64 + j; + + for (k = 0; k < row; k++) { + { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + } + + // apply the linear map to the non-systematic part + + for (j = (GFBITS * SYS_T + 127) / 128; j < 64; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = (GFBITS * SYS_T + 127) / 128; j < 64; j++) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j 
][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE8192128_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + for (d = 0; d < 64; d++) { + mask = ops[ row ][ c ] >> d; + mask &= 1; + mask = -mask; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] ^= mat[ c * 64 + d ][ k + (GFBITS * SYS_T) / 64 ] & mask; + } + } + } + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + PQCLEAN_MCELIECE8192128_SSE_store8(pk, one_row[ k ]); + pk += 8; + } + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/sse/pk_gen.h b/crypto_kem/mceliece8192128/sse/pk_gen.h new file mode 100644 index 00000000..052c55c9 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE8192128_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/powers.inc b/crypto_kem/mceliece8192128/sse/powers.inc new file mode 100644 index 00000000..231dc3ee --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/powers.inc @@ -0,0 +1,960 @@ +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece8192128/sse/scalars_2x.inc b/crypto_kem/mceliece8192128/sse/scalars_2x.inc new file mode 100644 index 00000000..086b83f1 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece8192128/sse/scalars_4x.inc b/crypto_kem/mceliece8192128/sse/scalars_4x.inc new file mode 100644 index 00000000..e3cff180 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), +}} + diff --git a/crypto_kem/mceliece8192128/sse/sk_gen.c b/crypto_kem/mceliece8192128/sse/sk_gen.c new file mode 100644 index 00000000..8b0068d0 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128_SSE_gf_inv(mat[j][j]); 
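+ /* continue the Gaussian elimination: scale by the pivot's inverse and eliminate the remaining rows; afterwards mat[SYS_T][0..SYS_T-1] holds the minimal-polynomial coefficients, which are copied into out[] below */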
+ + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/sse/sk_gen.h b/crypto_kem/mceliece8192128/sse/sk_gen.h new file mode 100644 index 00000000..efe346cc --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE8192128_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE8192128_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/sse/syndrome_asm.S b/crypto_kem/mceliece8192128/sse/syndrome_asm.S new file mode 100644 index 00000000..844913c6 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/syndrome_asm.S @@ -0,0 +1,1449 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg128 p + +# qhasm: reg128 e + +# qhasm: reg128 s + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_SSE_syndrome_asm +.global PQCLEAN_MCELIECE8192128_SSE_syndrome_asm +_PQCLEAN_MCELIECE8192128_SSE_syndrome_asm: +PQCLEAN_MCELIECE8192128_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,s=reg128#1 +# asm 2: movdqu 0(s=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: e = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(e=reg128#2 +# asm 2: movdqu 208(e=%xmm1 +movdqu 208(%rdx),%xmm1 + +# qhasm: s &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 16(p=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(e=reg128#3 +# asm 2: movdqu 224(e=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 32(p=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(e=reg128#3 +# asm 2: movdqu 240(e=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: p &= e +# asm
1: pand p=reg128#2 +# asm 2: movdqu 48(p=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(e=reg128#3 +# asm 2: movdqu 256(e=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 64(p=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(e=reg128#3 +# asm 2: movdqu 272(e=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 80(p=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(e=reg128#3 +# asm 2: movdqu 288(e=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 96(p=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(e=reg128#3 +# asm 2: movdqu 304(e=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 112(p=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(e=reg128#3 +# asm 2: movdqu 320(e=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 128(p=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(e=reg128#3 +# asm 2: movdqu 336(e=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 144(p=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(e=reg128#3 +# asm 2: movdqu 352(e=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 160(p=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(e=reg128#3 +# asm 2: movdqu 368(e=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 176(p=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(e=reg128#3 +# asm 2: movdqu 384(e=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 192(p=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(e=reg128#3 +# asm 2: movdqu 400(e=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 208(p=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(e=reg128#3 +# asm 2: movdqu 416(e=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 224(p=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(e=reg128#3 +# asm 2: movdqu 432(e=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 240(p=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(e=reg128#3 +# asm 2: movdqu 448(e=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 256(p=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(e=reg128#3 +# asm 2: movdqu 464(e=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 272(p=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(e=reg128#3 +# asm 2: movdqu 480(e=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 288(p=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(e=reg128#3 +# asm 2: movdqu 496(e=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: p &= e 
+# asm 1: pand p=reg128#2 +# asm 2: movdqu 304(p=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(e=reg128#3 +# asm 2: movdqu 512(e=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 320(p=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(e=reg128#3 +# asm 2: movdqu 528(e=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 336(p=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(e=reg128#3 +# asm 2: movdqu 544(e=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 352(p=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(e=reg128#3 +# asm 2: movdqu 560(e=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 368(p=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(e=reg128#3 +# asm 2: movdqu 576(e=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 384(p=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 592 ] +# asm 1: movdqu 592(e=reg128#3 +# asm 2: movdqu 592(e=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 400(p=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(e=reg128#3 +# asm 2: movdqu 608(e=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 416(p=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(e=reg128#3 +# asm 2: movdqu 624(e=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 432(p=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(e=reg128#3 +# asm 2: movdqu 640(e=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 448(p=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(e=reg128#3 +# asm 2: movdqu 656(e=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 464(p=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 672 ] +# asm 1: movdqu 672(e=reg128#3 +# asm 2: movdqu 672(e=%xmm2 +movdqu 672(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 480(p=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 688 ] +# asm 1: movdqu 688(e=reg128#3 +# asm 2: movdqu 688(e=%xmm2 +movdqu 688(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 496(p=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 704 ] +# asm 1: movdqu 704(e=reg128#3 +# asm 2: movdqu 704(e=%xmm2 +movdqu 704(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 512(p=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 720 ] +# asm 1: movdqu 720(e=reg128#3 +# asm 2: movdqu 720(e=%xmm2 +movdqu 720(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 528(p=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 736 ] +# asm 1: movdqu 736(e=reg128#3 +# asm 2: movdqu 736(e=%xmm2 +movdqu 736(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 544(p=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 752 ] +# asm 1: movdqu 752(e=reg128#3 +# asm 2: movdqu 752(e=%xmm2 +movdqu 752(%rdx),%xmm2 + +# 
qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 560(p=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 768 ] +# asm 1: movdqu 768(e=reg128#3 +# asm 2: movdqu 768(e=%xmm2 +movdqu 768(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 576(p=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 784 ] +# asm 1: movdqu 784(e=reg128#3 +# asm 2: movdqu 784(e=%xmm2 +movdqu 784(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 592(p=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 800 ] +# asm 1: movdqu 800(e=reg128#3 +# asm 2: movdqu 800(e=%xmm2 +movdqu 800(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 608(p=%xmm1 +movdqu 608(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 816 ] +# asm 1: movdqu 816(e=reg128#3 +# asm 2: movdqu 816(e=%xmm2 +movdqu 816(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 624(p=%xmm1 +movdqu 624(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 832 ] +# asm 1: movdqu 832(e=reg128#3 +# asm 2: movdqu 832(e=%xmm2 +movdqu 832(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 640(p=%xmm1 +movdqu 640(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 848 ] +# asm 1: movdqu 848(e=reg128#3 +# asm 2: movdqu 848(e=%xmm2 +movdqu 848(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 656(p=%xmm1 +movdqu 656(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 864 ] +# asm 1: movdqu 864(e=reg128#3 +# asm 2: movdqu 864(e=%xmm2 +movdqu 864(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 672(p=%xmm1 +movdqu 672(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 880 ] +# asm 1: movdqu 880(e=reg128#3 +# asm 2: movdqu 880(e=%xmm2 +movdqu 880(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 688(p=%xmm1 +movdqu 688(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 896 ] +# asm 1: movdqu 896(e=reg128#3 +# asm 2: movdqu 896(e=%xmm2 +movdqu 896(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 704(p=%xmm1 +movdqu 704(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 912 ] +# asm 1: movdqu 912(e=reg128#3 +# asm 2: movdqu 912(e=%xmm2 +movdqu 912(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 720(p=%xmm1 +movdqu 720(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 928 ] +# asm 1: movdqu 928(e=reg128#3 +# asm 2: movdqu 928(e=%xmm2 +movdqu 928(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 736(p=%xmm1 +movdqu 736(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 944 ] +# asm 1: movdqu 944(e=reg128#3 +# asm 2: movdqu 944(e=%xmm2 +movdqu 944(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 752(p=%xmm1 +movdqu 752(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 960 ] +# asm 1: movdqu 960(e=reg128#3 +# asm 2: movdqu 960(e=%xmm2 +movdqu 960(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 768(p=%xmm1 +movdqu 768(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 976 ] +# asm 1: movdqu 976(e=reg128#3 +# asm 2: movdqu 976(e=%xmm2 +movdqu 976(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 784(p=%xmm1 +movdqu 784(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 992 ] +# asm 1: movdqu 992(e=reg128#3 +# asm 2: movdqu 992(e=%xmm2 +movdqu 992(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 800(p=%xmm1 +movdqu 800(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 1008 ] +# asm 1: movdqu 1008(e=reg128#3 +# asm 2: movdqu 1008(e=%xmm2 +movdqu 
1008(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#6 +# asm 2: movq 0(b64=%r9 +movq 0(%rcx),%r9 + +# qhasm: c_all = count(b64) +# asm 1: popcnt c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 8 ] +# asm 1: movq 8(b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,s=reg128#1 +# asm 2: movdqu 0(s=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(e=reg128#2 +# asm 2: movdqu 0(e=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 16(s=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(e=reg128#2 +# asm 2: movdqu 16(e=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 32(s=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(e=reg128#2 +# asm 2: movdqu 32(e=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 48(s=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(e=reg128#2 +# asm 2: movdqu 48(e=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 64(s=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(e=reg128#2 +# asm 2: movdqu 64(e=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 80(s=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(e=reg128#2 +# asm 2: movdqu 80(e=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 96(s=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(e=reg128#2 +# asm 2: movdqu 96(e=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 112(s=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(e=reg128#2 +# asm 2: movdqu 112(e=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 128(s=%xmm0 +movdqu 128(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(e=reg128#2 +# asm 2: movdqu 128(e=%xmm1 +movdqu 128(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 144(s=%xmm0 +movdqu 144(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(e=reg128#2 +# asm 2: movdqu 144(e=%xmm1 +movdqu 144(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 160(s=%xmm0 +movdqu 160(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(e=reg128#2 +# asm 2: movdqu 160(e=%xmm1 +movdqu 160(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 176(s=%xmm0 +movdqu 176(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(e=reg128#2 +# asm 2: movdqu 176(e=%xmm1 +movdqu 176(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 192(s=%xmm0 +movdqu 192(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(e=reg128#2 +# asm 
2: movdqu 192(e=%xmm1 +movdqu 192(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + 
+# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld 
$16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor 
x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 
+# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq 
$32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld 
$16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 
+ +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 
+# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 
+# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 
1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld 
$16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = 
x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# 
qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 
2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: 
vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 
+vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 
2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 
= x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# 
qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand 
v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor 
x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand 
v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ 
input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 
+vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# 
asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 
576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# 
qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + 
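+# Note: the blocks above and below all follow the same qhasm pattern, repeated
+# for shift widths 4 (mask0/mask1), 2 (mask2/mask3) and 1 (mask4/mask5).
+# For a register pair (lo, hi) and an even/odd mask pair the code computes
+#   lo' = (lo & mask_even) | ((hi & mask_even) << s)
+#   hi' = ((lo & mask_odd) >> s) | (hi & mask_odd)
+# i.e. the masked shift-and-OR butterfly commonly used for bit-matrix
+# transposes, applied to the xmm registers x0..x7 holding one 128-byte slice
+# of the buffer at input_0 (%rdi) at a time.
+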
+# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor 
x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: 
vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# 
asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: 
vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# 
asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand 
%xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# 
asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 
+# asm 1: shr $1, + +void PQCLEAN_MCELIECE8192128_SSE_store_i(unsigned char *out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(v0, v1); + } +} + +void PQCLEAN_MCELIECE8192128_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128_SSE_vec128_set2x( PQCLEAN_MCELIECE8192128_SSE_load8(in), PQCLEAN_MCELIECE8192128_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128_SSE_store8(out + 0, PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128_SSE_store8(out + 8, PQCLEAN_MCELIECE8192128_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece8192128/sse/util.h b/crypto_kem/mceliece8192128/sse/util.h new file mode 100644 index 00000000..ddc83308 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_UTIL_H +#define PQCLEAN_MCELIECE8192128_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece8192128/sse/vec128.c b/crypto_kem/mceliece8192128/sse/vec128.c new file mode 100644 index 00000000..aea6bfd6 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions 
for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE8192128_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE8192128_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(result[10], in[11]); + result[11] = 
PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE8192128_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE8192128_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE8192128_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE8192128_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece8192128/sse/vec128.h b/crypto_kem/mceliece8192128/sse/vec128.h new file mode 100644 index 00000000..ceab047f --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE8192128_SSE_VEC128_H +#define PQCLEAN_MCELIECE8192128_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
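/*
   A minimal standalone sketch (not part of the PQClean sources) of the
   constraint noted above: _mm_extract_epi64() maps to the SSE4.1 pextrq
   instruction, whose lane index is an immediate operand, so compilers require
   the index argument to be an integer constant expression. Keeping the
   wrapper below a macro preserves that literal at every call site.

       // compile with -msse4.1 (or -march=native)
       #include <stdint.h>
       #include <stdio.h>
       #include <smmintrin.h>   // SSE4.1 intrinsics, provides _mm_extract_epi64

       int main(void) {
           __m128i v = _mm_set_epi64x(0x1122334455667788ULL,   // high 64-bit lane
                                      0x99AABBCCDDEEFF00ULL);  // low 64-bit lane

           // Literal lane indices are accepted and select one 64-bit lane each.
           uint64_t lo = (uint64_t) _mm_extract_epi64(v, 0);
           uint64_t hi = (uint64_t) _mm_extract_epi64(v, 1);

           // An index that is not a compile-time constant (e.g. a plain
           // "int i" loop variable) is rejected, which is why the extract
           // wrapper below is a macro rather than a function taking int i.
           printf("lo=%016llx hi=%016llx\n",
                  (unsigned long long) lo, (unsigned long long) hi);
           return 0;
       }
*/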
+#define PQCLEAN_MCELIECE8192128_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE8192128_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE8192128_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece8192128/sse/vec128_mul_asm.S b/crypto_kem/mceliece8192128/sse/vec128_mul_asm.S new file mode 100644 index 00000000..3ae48af6 --- /dev/null +++ b/crypto_kem/mceliece8192128/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE8192128_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# 
qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# 
asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor 
r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# 
asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 
+vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 
160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# 
asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: 
t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128/vec/api.h b/crypto_kem/mceliece8192128/vec/api.h new file mode 100644 index 00000000..620d5d6e --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_API_H +#define PQCLEAN_MCELIECE8192128_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_ALGNAME "Classic McEliece 8192128" +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/benes.c b/crypto_kem/mceliece8192128/vec/benes.c new file mode 100644 index 00000000..feb579b7 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for 
(iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece8192128/vec/benes.h b/crypto_kem/mceliece8192128/vec/benes.h new file mode 100644 index 00000000..a25b3f65 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_BENES_H +#define PQCLEAN_MCELIECE8192128_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE8192128_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/bm.c b/crypto_kem/mceliece8192128/vec/bm.c new file mode 100644 index 00000000..1123bc96 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/bm.c @@ -0,0 +1,245 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; 
i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128_VEC_bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + vec v[GFBITS]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = 0; + B[0][0] = 0; + B[1][0] = one << 63; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[1], C[1], interval[1]); + update(interval, coefs[N]); + d = vec_reduce(prod); + + t = 
PQCLEAN_MCELIECE8192128_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE8192128_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE8192128_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + v[i] = PQCLEAN_MCELIECE8192128_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out[0], C[0], v); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out[1], C[1], v); +} + diff --git a/crypto_kem/mceliece8192128/vec/bm.h b/crypto_kem/mceliece8192128/vec/bm.h new file mode 100644 index 00000000..b7b3e30a --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_BM_H +#define PQCLEAN_MCELIECE8192128_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/consts.inc b/crypto_kem/mceliece8192128/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 
0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 
0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 
0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 
0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 
0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 
0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 
0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 
0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece8192128/vec/controlbits.c b/crypto_kem/mceliece8192128/vec/controlbits.c new file mode 100644 index 00000000..8c888e07 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const 
uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/vec/controlbits.h b/crypto_kem/mceliece8192128/vec/controlbits.h new file mode 100644 index 00000000..a3e90410 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/crypto_hash.h b/crypto_kem/mceliece8192128/vec/crypto_hash.h new file mode 100644 index 00000000..8a4ec7b2 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128/vec/decrypt.c b/crypto_kem/mceliece8192128/vec/decrypt.c new file mode 100644 index 00000000..7488c854 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/decrypt.c @@ -0,0 +1,168 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE8192128_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE8192128_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + + recv[0] = 0; + + for (i = 1; i < 128; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 8; i++) { + recv[i] = PQCLEAN_MCELIECE8192128_VEC_load8(s + i * 8); + } +} + +static uint16_t weight(const vec *v) { + uint16_t i, w = 0; + + for (i = 0; i < SYS_N; i++) { + w += (uint16_t)((v[i / 64] >> (i % 64)) & 1); + } + + return w; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE8192128_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec
s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE8192128_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE8192128_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE8192128_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE8192128_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE8192128_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + check_weight = weight(error) ^ SYS_T; + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE8192128_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE8192128_VEC_benes(error, sk + IRR_BYTES, 0); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE8192128_VEC_store8(e + i * 8, error[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece8192128/vec/decrypt.h b/crypto_kem/mceliece8192128/vec/decrypt.h new file mode 100644 index 00000000..11bbe4ad --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE8192128_VEC_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE8192128_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/encrypt.c b/crypto_kem/mceliece8192128/vec/encrypt.c new file mode 100644 index 00000000..9ebc6009 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/encrypt.c @@ -0,0 +1,116 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq; + + uint16_t ind[ SYS_T ]; + uint8_t *ind8 = (uint8_t *)ind; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128_VEC_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE8192128_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE8192128_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= b >> 32;
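+ /* the shift-XOR cascade folds b down to its parity bit: the GF(2) inner product of public-key row i with the error bits past SYND_BYTES, which is then XORed into bit i of the syndrome */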
+ b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128/vec/encrypt.h b/crypto_kem/mceliece8192128/vec/encrypt.h new file mode 100644 index 00000000..3ee62ce6 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/fft.c b/crypto_kem/mceliece8192128/vec/fft.c new file mode 100644 index 00000000..604fb7ef --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/fft.c @@ -0,0 +1,274 @@ +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 128 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] =
-pre[i][j]; + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + 
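+ /* every buf[x] in this chain equals in[0][i] XORed with pre[j][i] for each bit j set in x; the assignments are ordered so that each entry is obtained from an earlier one by adding a single pre[] term */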
buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += (uint64_t)1 << i; + } + + // adding the part contributed by x^128 + + for (i = 0; i < 128; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} diff --git a/crypto_kem/mceliece8192128/vec/fft.h b/crypto_kem/mceliece8192128/vec/fft.h new file mode 100644 index 00000000..9a46454f --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_FFT_H +#define PQCLEAN_MCELIECE8192128_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128_VEC_fft(vec /*out*/[][GFBITS], vec /*in*/[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/fft_tr.c b/crypto_kem/mceliece8192128/vec/fft_tr.c new file mode 100644 index 00000000..81d28992 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/fft_tr.c @@ -0,0 +1,289 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + 
PQCLEAN_MCELIECE8192128_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + 
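+ /* (roughly) the transpose of the chain in fft.c's butterflies(): buf entries are folded together along the same single-bit steps, and the pre[j] vectors collect the terms associated with bit j */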
pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out[3], pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +void PQCLEAN_MCELIECE8192128_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128/vec/fft_tr.h b/crypto_kem/mceliece8192128/vec/fft_tr.h new file mode 100644 index 00000000..13fe4704 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_FFT_TR_H +#define 
PQCLEAN_MCELIECE8192128_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/gf.c b/crypto_kem/mceliece8192128/vec/gf.c new file mode 100644 index 00000000..2c34c1ae --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ 
(t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128_VEC_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128_VEC_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128_VEC_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128_VEC_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128/vec/gf.h b/crypto_kem/mceliece8192128/vec/gf.h new file mode 100644 index 00000000..cbd30a9c --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_GF_H +#define PQCLEAN_MCELIECE8192128_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128_VEC_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_VEC_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/operations.c b/crypto_kem/mceliece8192128/vec/operations.c new file mode 100644 index 00000000..81f4e05d --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + 
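// preimage = (b, e or s, C): b = 1 followed by the recovered error vector e on successful decryption, + // or b = 0 followed by the random SYS_N/8-byte string stored at the start of sk on failure; + // the mask m below performs this selection in constant time +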
uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/vec/params.h b/crypto_kem/mceliece8192128/vec/params.h new file mode 100644 index 00000000..5ddd48ef --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_PARAMS_H +#define PQCLEAN_MCELIECE8192128_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/pk_gen.c b/crypto_kem/mceliece8192128/vec/pk_gen.c new file mode 100644 index 00000000..93ca4234 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/pk_gen.c @@ -0,0 +1,248 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] 
|= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +int PQCLEAN_MCELIECE8192128_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c, d; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + uint64_t ops[ GFBITS * SYS_T ][ GFBITS * SYS_T / 64 ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ (SYS_N - GFBITS * SYS_T) / 64 ]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE8192128_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < (GFBITS * SYS_T + 63) / 64; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < (GFBITS * SYS_T + 63) / 64; j++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // Gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ row ][ c ] = 0; + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + ops[ row ][ i ] = 1; + ops[ row ][ i ] <<= j; + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + + // computing the linear map required to obtain the systematic form + + for (i = (GFBITS * SYS_T) / 64 - 1; i >= 0; i--) { + for (j = 63; j >= 0; j--) { + row = i * 64 + j; + + for (k = 0; k < row; k++) { + { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + } + + 
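// at this point ops holds the inverse of the leading (GFBITS * SYS_T) x (GFBITS * SYS_T) block of mat; + // applying it to the remaining columns below yields the public key in systematic form +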
// apply the linear map to the non-systematic part + + for (j = (GFBITS * SYS_T + 63) / 64; j < 128; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = (GFBITS * SYS_T + 63) / 64; j < 128; j++) { + PQCLEAN_MCELIECE8192128_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + for (d = 0; d < 64; d++) { + mask = ops[ row ][ c ] >> d; + mask &= 1; + mask = -mask; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] ^= mat[ c * 64 + d ][ k + (GFBITS * SYS_T) / 64 ] & mask; + } + } + } + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + PQCLEAN_MCELIECE8192128_VEC_store8(pk, one_row[ k ]); + pk += 8; + } + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/vec/pk_gen.h b/crypto_kem/mceliece8192128/vec/pk_gen.h new file mode 100644 index 00000000..81f82ea9 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE8192128_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128_VEC_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/powers.inc b/crypto_kem/mceliece8192128/vec/powers.inc new file mode 100644 index 00000000..a9bd6179 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/powers.inc @@ -0,0 +1,1920 @@ +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 
0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 
0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 
0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 
0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 
0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 
0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 
0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 
0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 
0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 
0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 
0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +} diff --git a/crypto_kem/mceliece8192128/vec/scalars_2x.inc b/crypto_kem/mceliece8192128/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece8192128/vec/scalars_4x.inc b/crypto_kem/mceliece8192128/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 
0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 
0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece8192128/vec/sk_gen.c b/crypto_kem/mceliece8192128/vec/sk_gen.c 
new file mode 100644 index 00000000..d3859812 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128_VEC_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128_VEC_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128_VEC_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128_VEC_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128_VEC_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128_VEC_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128_VEC_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128_VEC_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128/vec/sk_gen.h b/crypto_kem/mceliece8192128/vec/sk_gen.h new file mode 100644 index 00000000..7e8d0d8c --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_SK_GEN_H +#define PQCLEAN_MCELIECE8192128_VEC_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE8192128_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128_VEC_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/transpose.c b/crypto_kem/mceliece8192128/vec/transpose.c new file mode 100644 index 00000000..48a488a9 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/transpose.c @@ -0,0 +1,35 @@ +#include "transpose.h" + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) |
((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece8192128/vec/transpose.h b/crypto_kem/mceliece8192128/vec/transpose.h new file mode 100644 index 00000000..83fdcf27 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE8192128_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE8192128_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/util.c b/crypto_kem/mceliece8192128/vec/util.c new file mode 100644 index 00000000..f827f16c --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/util.c @@ -0,0 +1,76 @@ +#include "util.h" + +void PQCLEAN_MCELIECE8192128_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[0][i] = v0; + out[1][i] = v1; + } +} + +void PQCLEAN_MCELIECE8192128_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece8192128/vec/util.h b/crypto_kem/mceliece8192128/vec/util.h new file mode 100644 index 00000000..a6ce12c8 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/util.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_UTIL_H +#define PQCLEAN_MCELIECE8192128_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE8192128_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE8192128_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE8192128_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE8192128_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE8192128_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE8192128_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece8192128/vec/vec.c b/crypto_kem/mceliece8192128/vec/vec.c new file mode 100644 index 00000000..6eabd459 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/vec.c @@ -0,0 +1,138 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE8192128_VEC_vec_setbits(vec b) { + vec ret = -b; + 
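+ // note: with b in {0, 1}, the unsigned negation -b is either all-zero or all-one bits, i.e. it broadcasts b to every bit of the 64-bit word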
+ return ret; +} + +vec PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(uint16_t v) { + vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE8192128_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE8192128_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE8192128_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return (int)(a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE8192128_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE8192128_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE8192128_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE8192128_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE8192128_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece8192128/vec/vec.h b/crypto_kem/mceliece8192128/vec/vec.h new file mode 100644 index 00000000..b3699f23 --- /dev/null +++ b/crypto_kem/mceliece8192128/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_VEC_VEC_H +#define PQCLEAN_MCELIECE8192128_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE8192128_VEC_vec_setbits(vec b); + +vec 
PQCLEAN_MCELIECE8192128_VEC_vec_set1_16b(uint16_t v); + +void PQCLEAN_MCELIECE8192128_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE8192128_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE8192128_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE8192128_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE8192128_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE8192128_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/META.yml b/crypto_kem/mceliece8192128f/META.yml new file mode 100644 index 00000000..907ade49 --- /dev/null +++ b/crypto_kem/mceliece8192128f/META.yml @@ -0,0 +1,50 @@ +name: Classic McEliece 8192128f +type: kem +claimed-nist-level: 5 +claimed-security: IND-CCA2 +length-public-key: 1357824 +length-secret-key: 14080 +length-ciphertext: 240 +length-shared-secret: 32 +nistkat-sha256: 464f27c8eeef313c1bb024330fdc00125bbf0a28fccd9053e232a9cb0a1a0ac0 +principal-submitters: + - Daniel J. Bernstein + - Tung Chou + - Tanja Lange + - Ingo von Maurich + - Rafael Misoczki + - Ruben Niederhagen + - Edoardo Persichetti + - Christiane Peters + - Peter Schwabe + - Nicolas Sendrier + - Jakub Szefer + - Wen Wang +auxiliary-submitters: [] +implementations: + - name: clean + version: SUPERCOP-20191221 + - name: vec + version: SUPERCOP-20191221 + - name: sse + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - sse4_1 + - popcnt + - bmi + - name: avx + version: SUPERCOP-20191221 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - popcnt + - bmi diff --git a/crypto_kem/mceliece8192128f/avx/LICENSE b/crypto_kem/mceliece8192128f/avx/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. 
Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece8192128f/avx/Makefile b/crypto_kem/mceliece8192128f/avx/Makefile new file mode 100644 index 00000000..658c61b7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/Makefile @@ -0,0 +1,44 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece8192128f_avx.a + + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c \ + transpose.c uint32_sort.c util.c vec128.c vec256.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ + transpose_64x256_sp_asm.S update_asm.S vec128_mul_asm.S \ + vec256_ama_asm.S vec256_maa_asm.S vec256_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ + params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h vec128.h \ + vec256.h \ + consts.inc powers.inc scalars_2x.inc scalars_4x.inc + + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o \ + transpose.o uint32_sort.o util.o vec128.o vec256.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + transpose_64x256_sp_asm.o update_asm.o vec128_mul_asm.o \ + vec256_ama_asm.o vec256_maa_asm.o vec256_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece8192128f/avx/aes256ctr.c b/crypto_kem/mceliece8192128f/avx/aes256ctr.c new file mode 100644 index 00000000..aa49695c --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE8192128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece8192128f/avx/aes256ctr.h b/crypto_kem/mceliece8192128f/avx/aes256ctr.h new file mode 100644 index 00000000..e14e5c19 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE8192128F_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/api.h b/crypto_kem/mceliece8192128f/avx/api.h new file mode 100644 index 00000000..3c8645bb --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_API_H +#define PQCLEAN_MCELIECE8192128F_AVX_API_H + +#include + +#define 
PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_ALGNAME "Classic McEliece 8192128f" +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/benes.c b/crypto_kem/mceliece8192128f/avx/benes.c new file mode 100644 index 00000000..f371d057 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = 
PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], 
bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128F_AVX_load8(ptr), PQCLEAN_MCELIECE8192128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128F_AVX_load8(ptr), PQCLEAN_MCELIECE8192128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; 
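+ // middle stage of the Benes network: keep ascending through layers 4 and 5, then mirror back down (4, 3, 2, 1, 0, x) before the next transpose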
+ layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece8192128f/avx/benes.h b/crypto_kem/mceliece8192128f/avx/benes.h new file mode 100644 index 00000000..b422e0f3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_BENES_H +#define PQCLEAN_MCELIECE8192128F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/bm.c b/crypto_kem/mceliece8192128f/avx/bm.c new file mode 100644 index 00000000..9595edc4 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/bm.c @@ -0,0 +1,214 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE8192128F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = vec256_or(vec256_and(in[idx0], mask[0]), + vec256_sll_4x(vec256_and(in[idx1], mask[0]), s)); + + y = vec256_or(vec256_srl_4x(vec256_and(in[idx0], mask[1]), s), + vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = vec256_setzero(); + } + + mask[0][0] = vec256_set1_16b(0x5555); + mask[0][1] = vec256_set1_16b(0xAAAA); + mask[1][0] = vec256_set1_16b(0x3333); + mask[1][1] = vec256_set1_16b(0xCCCC); + mask[2][0] = vec256_set1_16b(0x0F0F); + mask[2][1] = vec256_set1_16b(0xF0F0); + mask[3][0] = 
vec256_set1_16b(0x00FF); + mask[3][1] = vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + vec128 db[ GFBITS ][ 2 ]; + vec128 BC_tmp[ GFBITS ][ 2 ]; + vec128 BC[ GFBITS ][ 2 ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC[0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0, one << 63); + BC[0][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(prod, interval, BC[0] + 1, 32); + PQCLEAN_MCELIECE8192128F_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE8192128F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((d >> i) & 1); + db[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((b >> i) & 1); + } + + vec256_mul((vec256 *) BC_tmp, (vec256 *) db, (vec256 *) BC); + + vec128_cmov(BC, mask); + PQCLEAN_MCELIECE8192128F_AVX_update_asm(BC, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(BC_tmp[i][0], BC_tmp[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = 
PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(out, prod, BC[0] + 1, 32); +} + diff --git a/crypto_kem/mceliece8192128f/avx/bm.h b/crypto_kem/mceliece8192128f/avx/bm.h new file mode 100644 index 00000000..7450716d --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_BM_H +#define PQCLEAN_MCELIECE8192128F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/consts.S b/crypto_kem/mceliece8192128f/avx/consts.S new file mode 100644 index 00000000..57ba3d8b --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE8192128F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece8192128f/avx/consts.inc b/crypto_kem/mceliece8192128f/avx/consts.inc new file mode 100644 index 00000000..bfbba9dd --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + 
vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 
0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + vec256_set4x(0XFF0000FF00FFFF00, 
0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 
0X0F0F0F0FF0F0F0F0), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 
0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/crypto_kem/mceliece8192128f/avx/controlbits.c b/crypto_kem/mceliece8192128f/avx/controlbits.c new file mode 100644 index 00000000..c99b54fe --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include <stdint.h> + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[j] = i */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/avx/controlbits.h b/crypto_kem/mceliece8192128f/avx/controlbits.h new file mode 100644 index 00000000..cd2054cc --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include <stdint.h> + +void PQCLEAN_MCELIECE8192128F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/crypto_hash.h b/crypto_kem/mceliece8192128f/avx/crypto_hash.h new file mode 100644 index 00000000..3f48edda --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/decrypt.c b/crypto_kem/mceliece8192128f/avx/decrypt.c new file mode 100644 index 00000000..8fefd4dd --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include <stdint.h> + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE8192128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_sq(eval[i], eval[i]); + } + + vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + vec256_mul(inv[i + 1], tmp, inv[i]); + vec256_mul(tmp, tmp, eval[i + 1]); + } + + vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = vec256_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128F_AVX_load16(s + i * 16); + } +} + +static int weight(vec256 *v) { + int i, w = 0; + + for (i = 0; i < 32; i++) { + w += (int)_mm_popcnt_u64( vec256_extract(v[i], 0) ); + w += (int)_mm_popcnt_u64( vec256_extract(v[i], 1) ); + w += (int)_mm_popcnt_u64( vec256_extract(v[i], 2) ); + w += (int)_mm_popcnt_u64( vec256_extract(v[i], 3) ); + } + + return w; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = vec256_or(diff, vec256_xor(s0[i], s1[i])); + } + + return vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = 
vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = vec256_extract(in[i], 0); + v[1] = vec256_extract(in[i], 1); + v[2] = vec256_extract(in[i], 2); + v[3] = vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* input: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 64 ][ GFBITS ]; + vec256 scaled[ 64 ][ GFBITS ]; + vec256 eval[ 64 ][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE8192128F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE8192128F_AVX_benes(recv128, bits_int, 1); + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); // scaling + PQCLEAN_MCELIECE8192128F_AVX_fft_tr(s_priv, scaled); // transposed FFT + PQCLEAN_MCELIECE8192128F_AVX_bm(locator, s_priv); // Berlekamp Massey + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, locator); // FFT + + // reencryption and weight check + + allone = vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = vec256_or_reduce(eval[i]); + error256[i] = vec256_xor(error256[i], allone); + } + + check_weight = (uint16_t)(weight(error256) ^ SYS_T); + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE8192128F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE8192128F_AVX_benes(error128, bits_int, 0); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store16(e + i * 16, error128[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece8192128f/avx/decrypt.h b/crypto_kem/mceliece8192128f/avx/decrypt.h new file mode 100644 index 00000000..e28a0913 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE8192128F_AVX_DECRYPT_H +/* + This file is for Niederreiter decryption +*/ + +int PQCLEAN_MCELIECE8192128F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/encrypt.c b/crypto_kem/mceliece8192128f/avx/encrypt.c new file mode 100644 index 00000000..d2a032da --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/encrypt.c @@ -0,0 +1,80 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include <stdint.h> + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq; + 
uint16_t ind[ SYS_T ]; + int32_t ind32[ SYS_T ]; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T; i++) { + ind32[i] = ind[i] &= GFMASK; + } + + // check for repetition + + PQCLEAN_MCELIECE8192128F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128f/avx/encrypt.h b/crypto_kem/mceliece8192128f/avx/encrypt.h new file mode 100644 index 00000000..177c6ae3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/fft.c b/crypto_kem/mceliece8192128f/avx/fft.c new file mode 100644 index 00000000..0cee28ae --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include <stdint.h> + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { 
+ for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = 
vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = vec256_xor(out[i][b], 
powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece8192128f/avx/fft.h b/crypto_kem/mceliece8192128f/avx/fft.h new file mode 100644 index 00000000..b2fea14f --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_FFT_H +#define PQCLEAN_MCELIECE8192128F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE8192128F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/fft_tr.c b/crypto_kem/mceliece8192128f/avx/fft_tr.c new file mode 100644 index 00000000..ba1e57d0 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = vec256_and(in[i], mask[k][0]); + t = vec256_sll_4x(t, 1 << k); + in[i] = vec256_xor(in[i], t); + + t = vec256_and(in[i], mask[k][1]); + t = vec256_sll_4x(t, 1 << k); + in[i] = vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = vec256_extract(in[i], 0); + v[1] = vec256_extract(in[i], 1); + v[2] = vec256_extract(in[i], 2); + v[3] = vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = vec256_extract(in[i], 0); + v[1] = vec256_extract(in[i], 1); + v[2] = vec256_extract(in[i], 2); 
+ v[3] = vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[45]); + 
buf.V[47] = vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = vec256_xor(buf.V[30], buf.V[31]); + 
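+ // Once this walk completes, pre.v[k] holds the partial sums that are later
+ // multiplied (bit-sliced via vec128_setbits) by the bits of beta[k] and
+ // XORed into out128[.][1], while the closing buf.V[0] ^ buf.V[1] value
+ // supplies out128[.][0].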
pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = vec256_extract2x(t, 1); + } + out128[i + 0][0] = vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128f/avx/fft_tr.h b/crypto_kem/mceliece8192128f/avx/fft_tr.h new file mode 100644 index 00000000..ea10712f --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/fft_tr.h @@ 
-0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE8192128F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/gf.c b/crypto_kem/mceliece8192128f/avx/gf.c new file mode 100644 index 00000000..b3cd9f20 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & 
(0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128F_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/avx/gf.h b/crypto_kem/mceliece8192128f/avx/gf.h new file mode 100644 index 00000000..2d58e7d3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_GF_H +#define PQCLEAN_MCELIECE8192128F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/int32_sort.c b/crypto_kem/mceliece8192128f/avx/int32_sort.c new file mode 100644 index 00000000..9fd1b766 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/int32_sort.c @@ -0,0 +1,1208 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + 
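+ /* The 8-lane pass just above covers the ragged tail (n not a multiple of 8)
+    by reloading the last, possibly overlapping, 8 pairs; applying MINMAX
+    again to a pair that is already ordered changes nothing, so the overlap
+    with the main 8-at-a-time loop below is harmless. */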
} + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, 
&x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = 
int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 
x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 
= int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = 
_mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * 
q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q 
+= q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE8192128F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + 
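+ /* At this point the 64 elements of this block have been compared at
+    distances 32, 16, 8, 4 and 2; the unpack rounds below perform the final
+    distance-1 compare and put the lanes back into their natural order
+    before the results are stored. */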
int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 
8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/mceliece8192128f/avx/int32_sort.h b/crypto_kem/mceliece8192128f/avx/int32_sort.h new file mode 100644 index 00000000..10e286fc --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE8192128F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE8192128F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/operations.c b/crypto_kem/mceliece8192128f/avx/operations.c new file mode 100644 index 00000000..ce461ca9 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t 
conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/avx/params.h b/crypto_kem/mceliece8192128f/avx/params.h new file mode 100644 index 00000000..7e7766d6 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE8192128F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/pk_gen.c b/crypto_kem/mceliece8192128f/avx/pk_gen.c new file mode 100644 index 00000000..d0317682 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/pk_gen.c @@ -0,0 +1,355 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4]; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ 128 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
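+ // (the reduction below is branch-free: the OR of all remaining rows locates
+ // the next pivot column with ctz(), and single pivot bits are stretched into
+ // full-word masks, so conditional row additions replace data-dependent swaps)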
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] & 0x00000000FFFFFFFF) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] & 0xFFFFFFFF00000000) | (buf[j] >> 32); + } + } + + return 0; +} + + +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE8192128F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, sk_int); + + vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + vec256_mul(prod[i + 1], prod[i], tmp); + vec256_mul(tmp, tmp, eval[i + 1]); + } + + vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = vec256_extract(prod[ j ][ k ], 2); + mat[ 
i * GFBITS + k ][ 4 * j + 3 ] = vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < 128; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < 128; j++) { + PQCLEAN_MCELIECE8192128F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/avx/pk_gen.h b/crypto_kem/mceliece8192128f/avx/pk_gen.h new file mode 100644 index 00000000..5620a761 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE8192128F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/powers.inc b/crypto_kem/mceliece8192128f/avx/powers.inc new file mode 100644 index 00000000..6469fc4d --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/powers.inc @@ -0,0 +1,480 @@ +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 
0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 
0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + 
vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 
0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 
0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 
0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 
0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 
0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + 
vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece8192128f/avx/scalars_2x.inc b/crypto_kem/mceliece8192128f/avx/scalars_2x.inc new file mode 100644 index 00000000..5d664f2c --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0003000F3C03C0C0, 
0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece8192128f/avx/scalars_4x.inc b/crypto_kem/mceliece8192128f/avx/scalars_4x.inc new file mode 100644 index 00000000..dbdbed62 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 
0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/crypto_kem/mceliece8192128f/avx/sk_gen.c b/crypto_kem/mceliece8192128f/avx/sk_gen.c new file mode 100644 index 00000000..05f3bc14 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return 
if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/avx/sk_gen.h b/crypto_kem/mceliece8192128f/avx/sk_gen.h new file mode 100644 index 00000000..5db0e861 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE8192128F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/syndrome_asm.S b/crypto_kem/mceliece8192128f/avx/syndrome_asm.S new file mode 100644 index 00000000..a7f2d653 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/syndrome_asm.S @@ -0,0 +1,910 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm +_PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm: +PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# 
asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 
544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 816 ] +# asm 1: vmovupd 816(ee=reg256#3 +# asm 2: vmovupd 816(ee=%ymm2 +vmovupd 816(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 848 ] +# asm 1: vmovupd 848(ee=reg256#3 +# asm 2: vmovupd 848(ee=%ymm2 +vmovupd 848(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 672(pp=%ymm1 +vmovupd 672(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 880 ] +# asm 1: vmovupd 880(ee=reg256#3 +# asm 2: vmovupd 880(ee=%ymm2 +vmovupd 880(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 704(pp=%ymm1 +vmovupd 704(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 912 ] +# asm 1: vmovupd 912(ee=reg256#3 +# asm 2: vmovupd 912(ee=%ymm2 +vmovupd 912(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 736(pp=%ymm1 +vmovupd 736(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 944 ] +# asm 1: vmovupd 944(ee=reg256#3 +# asm 2: vmovupd 944(ee=%ymm2 +vmovupd 944(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 768(pp=%ymm1 +vmovupd 768(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 976 ] +# asm 1: vmovupd 976(ee=reg256#3 +# asm 2: vmovupd 976(ee=%ymm2 +vmovupd 976(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 800] +# asm 1: movq 800(s=int64#6 +# asm 2: movq 800(s=%r9 +movq 800(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 1008] +# asm 1: movq 1008(e=int64#7 +# asm 2: movq 1008(e=%rax +movq 1008(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 808(p=%rax +movq 808(%rsi),%rax + +# qhasm: e = mem64[input_2 + 1016] +# asm 1: movq 1016(e=int64#8 +# asm 2: movq 1016(e=%r10 +movq 1016(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: 
ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] 
+# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: 
vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw 
$8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 
+ +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = 
x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw 
$8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 
944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand 
v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw 
$8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 
& mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 
16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: 
movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand 
v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 
+vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: 
v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# 
qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 
1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 
2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw 
$8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ 
PQCLEAN_MCELIECE8192128F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & 
mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 
<<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand 
v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: 
vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand 
v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: 
vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 
1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 
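
# ----------------------------------------------------------------------
# Note (inferred from the visible instruction pattern; not part of the
# qhasm output, and the formula below is an illustrative summary only):
# the repeated blocks in this file implement a constant-time bit-level
# transpose as a butterfly network.  Each pass pairs two rows (x_i, x_j),
# selects alternating s-bit groups with a mask constant, shifts the
# exchanged groups by s bit positions (s = 32, 16, 8 in the
# vpsllq/vpslld/vpsllw passes, s = 4, 2, 1 in the psllq/psrlq passes),
# and recombines with vpor.  With maskA selecting the low half and maskB
# the high half of each 2s-bit group, the effect of one pass is:
#
#   x_i' = (x_i & maskA) | ((x_j & maskA) << s)
#   x_j' = ((x_i & maskB) >> s) | (x_j & maskB)
#
# The mask constants are the PQCLEAN_MCELIECE8192128F_AVX_MASK* tables
# loaded earlier in this routine.
# ----------------------------------------------------------------------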
+ +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: 
vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 
+# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 
688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# 
qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: 
vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# 
asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# 
asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: 
vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 
512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand 
v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 
+# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: 
vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: 
vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: 
vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 
2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 
1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor 
x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: 
vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: 
vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 
160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: 
vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: 
vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 
1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: 
vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 
+vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand 
v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor 
%ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 
aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand 
v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | 
v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# 
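The qhasm-generated listing above repeats one small building block over and over: a masked shift-and-merge that exchanges a group of bits between two 256-bit rows. As an illustration only (not the shipped qhasm output), the plain-C sketch below shows the effect of one such stage on a single 64-bit lane; the name swap_bits and its parameter names are hypothetical, and the assembly applies the same operation to four 64-bit lanes at once per ymm register.

#include <stdint.h>

/* Illustrative sketch, not the generated assembly: one interleaving stage of
 * the bit-matrix transpose on one 64-bit lane of rows a and b.  Row a keeps
 * the bits selected by mask_lo and takes row b's mask_lo bits shifted up by
 * s; row b keeps its mask_hi bits and takes row a's mask_hi bits shifted
 * down by s.  This mirrors the vpand / vpsllq / vpsrlq / vpor pattern above. */
static inline void swap_bits(uint64_t *a, uint64_t *b,
                             uint64_t mask_lo, uint64_t mask_hi, int s) {
    uint64_t v00 = *a & mask_lo;           /* bits of a that stay in place */
    uint64_t v10 = (*b & mask_lo) << s;    /* bits of b moving into a      */
    uint64_t v01 = (*a & mask_hi) >> s;    /* bits of a moving into b      */
    uint64_t v11 = *b & mask_hi;           /* bits of b that stay in place */
    *a = v00 | v10;
    *b = v01 | v11;
}

For example, swap_bits(&x0, &x4, 0x0f0f0f0f0f0f0f0fULL, 0xf0f0f0f0f0f0f0f0ULL, 4) corresponds to the shift-by-4 stage applied to the (x0, x4) pair, assuming the MASK0, MASK1 and MASK2 constant pairs hold the usual 0x55…/0xaa…, 0x33…/0xcc… and 0x0f…/0xf0… values.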
qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor 
x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# 
asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 
+vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 
2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# 
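Each 256-byte block of the buffer is processed independently and in place: eight 256-bit rows are loaded, passed through a shift-4, a shift-2 and a shift-1 stage on fixed pairs of registers, and written back to the same offsets. The sketch below outlines that driver structure with AVX2 intrinsics rather than qhasm; it is a hypothetical rendering for readability, and the STAGE macro, the transpose_8x8_blocks name and the 0x0f…/0x33…/0x55… mask constants are assumptions, not the shipped code.

#include <immintrin.h>

/* One interleaving stage on a pair of 256-bit rows (four 64-bit lanes). */
#define STAGE(a, b, m_lo, m_hi, s)                                           \
    do {                                                                      \
        __m256i lo_a = _mm256_and_si256((a), (m_lo));                         \
        __m256i lo_b = _mm256_slli_epi64(_mm256_and_si256((b), (m_lo)), (s)); \
        __m256i hi_a = _mm256_srli_epi64(_mm256_and_si256((a), (m_hi)), (s)); \
        __m256i hi_b = _mm256_and_si256((b), (m_hi));                         \
        (a) = _mm256_or_si256(lo_a, lo_b);                                    \
        (b) = _mm256_or_si256(hi_a, hi_b);                                    \
    } while (0)

/* Hypothetical driver: 64 rows of 256 bits, handled as 8 blocks of 8 rows. */
static void transpose_8x8_blocks(unsigned char *buf) {
    const __m256i m4_lo = _mm256_set1_epi64x(0x0f0f0f0f0f0f0f0fLL);
    const __m256i m4_hi = _mm256_set1_epi64x((long long)0xf0f0f0f0f0f0f0f0ULL);
    const __m256i m2_lo = _mm256_set1_epi64x(0x3333333333333333LL);
    const __m256i m2_hi = _mm256_set1_epi64x((long long)0xccccccccccccccccULL);
    const __m256i m1_lo = _mm256_set1_epi64x(0x5555555555555555LL);
    const __m256i m1_hi = _mm256_set1_epi64x((long long)0xaaaaaaaaaaaaaaaaULL);

    for (int g = 0; g < 8; g++) {
        __m256i x[8];
        for (int i = 0; i < 8; i++) {
            x[i] = _mm256_loadu_si256((const __m256i *)(buf + 256 * g + 32 * i));
        }
        /* shift-4 stage on pairs (0,4) (1,5) (2,6) (3,7) */
        STAGE(x[0], x[4], m4_lo, m4_hi, 4);
        STAGE(x[1], x[5], m4_lo, m4_hi, 4);
        STAGE(x[2], x[6], m4_lo, m4_hi, 4);
        STAGE(x[3], x[7], m4_lo, m4_hi, 4);
        /* shift-2 stage on pairs (0,2) (1,3) (4,6) (5,7) */
        STAGE(x[0], x[2], m2_lo, m2_hi, 2);
        STAGE(x[1], x[3], m2_lo, m2_hi, 2);
        STAGE(x[4], x[6], m2_lo, m2_hi, 2);
        STAGE(x[5], x[7], m2_lo, m2_hi, 2);
        /* shift-1 stage on pairs (0,1) (2,3) (4,5) (6,7) */
        STAGE(x[0], x[1], m1_lo, m1_hi, 1);
        STAGE(x[2], x[3], m1_lo, m1_hi, 1);
        STAGE(x[4], x[5], m1_lo, m1_hi, 1);
        STAGE(x[6], x[7], m1_lo, m1_hi, 1);
        for (int i = 0; i < 8; i++) {
            _mm256_storeu_si256((__m256i *)(buf + 256 * g + 32 * i), x[i]);
        }
    }
}

In the generated assembly this structure is fully unrolled and register-allocated by qhasm, which is why the listing repeats an identical instruction pattern for every block at byte offsets 0, 256, …, 1792.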
qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand 
v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 
2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# 
asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand 
v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# 
asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand 
v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor 
x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 
+# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 
+vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE8192128F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/update_asm.S b/crypto_kem/mceliece8192128f/avx/update_asm.S new file mode 100644 index 00000000..89cccfdf --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_update_asm +.global PQCLEAN_MCELIECE8192128F_AVX_update_asm +_PQCLEAN_MCELIECE8192128F_AVX_update_asm: +PQCLEAN_MCELIECE8192128F_AVX_update_asm: +mov %rsp,%r11 +and 
$31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE8192128F_AVX_store_i(unsigned char 
*out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x( PQCLEAN_MCELIECE8192128F_AVX_load8(in), PQCLEAN_MCELIECE8192128F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128F_AVX_store8(out + 0, PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128F_AVX_store8(out + 8, PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece8192128f/avx/util.h b/crypto_kem/mceliece8192128f/avx/util.h new file mode 100644 index 00000000..6cf3988d --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_UTIL_H +#define PQCLEAN_MCELIECE8192128F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/vec128.c b/crypto_kem/mceliece8192128f/avx/vec128.c new file mode 100644 index 00000000..1d1e1516 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + 
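Editorial aside, not part of the patch: the util.c helpers just above (PQCLEAN_MCELIECE8192128F_AVX_load2/load4/load8 and the matching store routines) fix the serialization convention for key material — bytes are consumed and emitted least-significant first. A minimal standalone sketch of that little-endian round trip; le_store8/le_load8 are placeholder names mirroring store8/load8 above:

#include <assert.h>
#include <stdint.h>

/* byte k receives bits 8k..8k+7, as in store8 above */
static void le_store8(unsigned char *out, uint64_t in) {
    for (int k = 0; k < 8; k++) {
        out[k] = (unsigned char)((in >> (8 * k)) & 0xFF);
    }
}

/* rebuild the value from the most-significant byte down, as in load8 above */
static uint64_t le_load8(const unsigned char *in) {
    uint64_t ret = 0;
    for (int k = 7; k >= 0; k--) {
        ret <<= 8;
        ret |= in[k];
    }
    return ret;
}

int main(void) {
    unsigned char buf[8];
    uint64_t x = 0x0123456789ABCDEFULL;

    le_store8(buf, x);
    assert(buf[0] == 0xEF && buf[7] == 0x01); /* least-significant byte first */
    assert(le_load8(buf) == x);               /* the pair round-trips */
    return 0;
}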
+vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/crypto_kem/mceliece8192128f/avx/vec128.h b/crypto_kem/mceliece8192128f/avx/vec128.h new file mode 100644 index 00000000..c1aa6dca --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_VEC128_H +#define PQCLEAN_MCELIECE8192128F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/crypto_kem/mceliece8192128f/avx/vec128_mul_asm.S b/crypto_kem/mceliece8192128f/avx/vec128_mul_asm.S new file mode 100644 index 00000000..77172c3b --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 
h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand 
%ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# 
qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: 
input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor 
h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# 
asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 
160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE8192128F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/crypto_kem/mceliece8192128f/avx/vec256_ama_asm.S b/crypto_kem/mceliece8192128f/avx/vec256_ama_asm.S new file mode 100644 index 00000000..de644122 --- /dev/null +++ b/crypto_kem/mceliece8192128f/avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: 
reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 
+ 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 
+vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 
224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ 
input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# 
qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 
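The generated blocks above all repeat one pattern per 256-bit limb: XOR limb k of the first input with the matching limb of the second input, write the sum back to the first input, and then AND the sum against each of the 13 limbs of the third input, folding every partial product into accumulator r_{k+j}. A rough scalar C sketch of that single step follows; the identifiers, the 64-bit word width, and GFBITS = 13 (inferred from the 0..384 byte offsets) are illustrative assumptions, not the interface of this qhasm-generated routine. The reduction and write-back of the accumulators follow the same idea as the sketch given after the next routine.

#include <stdint.h>

#define GFBITS 13                      /* assumed: 13 limbs, byte offsets 0..384 */

/* One outer-loop step of the pattern above: sum the k-th limbs of the two
 * inputs, store the sum back, and multiply-accumulate it against every
 * limb of the third input (r_{k+j} ^= a_k & b_j). */
static void limb_step_sketch(uint64_t *in0, const uint64_t *in1,
                             const uint64_t *in2,
                             uint64_t acc[2 * GFBITS - 1], int k) {
    uint64_t a = in0[k] ^ in1[k];      /* a_k ^= mem256[ input_1 + 32k ] */
    in0[k] = a;                        /* mem256[ input_0 + 32k ] = a_k  */
    for (int j = 0; j < GFBITS; j++) {
        acc[k + j] ^= a & in2[j];      /* r_{k+j} ^= a_k & b_j           */
    }
}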
+ +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 
] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: 
vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 
+vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 
192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: 
vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 
160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + 
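The pass that starts here follows the same schoolbook AND/XOR pattern but reads its a-limbs from a single buffer: underneath the generated code is one bitsliced multiplication in GF(2^13), where bit t of limb k holds coefficient k of element t, so a single vpand/vpxor pair combines the corresponding coefficient bit of 256 elements at once. The 25 accumulators r0..r24 are then folded back below r13 by the field polynomial; the r15 ^= r24 step just above is one such fold. A minimal scalar C sketch of the whole multiply-and-reduce is given below, assuming the GF(2^13) modulus x^13 + x^4 + x^3 + x + 1 of the larger Classic McEliece parameter sets; the names and 64-bit width are illustrative, not this file's symbols.

#include <stdint.h>

#define GFBITS 13                      /* assumed: 13 bitsliced limbs */

/* Bitsliced GF(2^13) multiply: schoolbook product into 25 accumulators,
 * then reduction modulo x^13 + x^4 + x^3 + x + 1. */
static void vec_mul_sketch(uint64_t h[GFBITS],
                           const uint64_t f[GFBITS],
                           const uint64_t g[GFBITS]) {
    uint64_t buf[2 * GFBITS - 1] = {0};
    int i, j;

    /* schoolbook product: the vpand/vpxor chains above */
    for (i = 0; i < GFBITS; i++) {
        for (j = 0; j < GFBITS; j++) {
            buf[i + j] ^= f[i] & g[j];
        }
    }

    /* fold the high accumulators back down through the modulus */
    for (i = 2 * GFBITS - 2; i >= GFBITS; i--) {
        buf[i - GFBITS + 4] ^= buf[i];
        buf[i - GFBITS + 3] ^= buf[i];
        buf[i - GFBITS + 1] ^= buf[i];
        buf[i - GFBITS + 0] ^= buf[i];
    }

    for (i = 0; i < GFBITS; i++) {
        h[i] = buf[i];
    }
}

The AVX2 code unrolls both loops completely and keeps the b-limbs in memory (offsets 0, 32, ..., 384 from input_2), which is why the same vpand offsets recur for every a-limb.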
+# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 
352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 
120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128F_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128f/clean/api.h b/crypto_kem/mceliece8192128f/clean/api.h new file mode 100644 index 00000000..9feabad8 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_API_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_ALGNAME "Classic McEliece 8192128f" +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/benes.c 
b/crypto_kem/mceliece8192128f/clean/benes.c new file mode 100644 index 00000000..cd4b2812 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/benes.c @@ -0,0 +1,180 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128F_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + unsigned char *r_ptr = r; + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(r_ptr + i * 16 + 0); + r_int_v[1][i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(r_ptr + i * 16 + 8); + } + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + 
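+        /* write the two permuted 64x64 bit blocks back into the caller's byte array */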
PQCLEAN_MCELIECE8192128F_CLEAN_store8(r_ptr + i * 16 + 0, r_int_v[0][i]); + PQCLEAN_MCELIECE8192128F_CLEAN_store8(r_ptr + i * 16 + 8, r_int_v[1][i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE8192128F_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE8192128F_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE8192128F_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/benes.h b/crypto_kem/mceliece8192128f/clean/benes.h new file mode 100644 index 00000000..d17cbc43 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/benes.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_BENES_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128F_CLEAN_apply_benes(uint8_t *r, const uint8_t *bits, int rev); +void PQCLEAN_MCELIECE8192128F_CLEAN_support_gen(gf *s, const uint8_t *c); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/bm.c b/crypto_kem/mceliece8192128f/clean/bm.c new file mode 100644 index 00000000..a4db4b37 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/bm.c @@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE8192128F_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE8192128F_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/bm.h b/crypto_kem/mceliece8192128f/clean/bm.h new file mode 100644 index 00000000..c8761c93 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_BM_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128F_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/controlbits.c b/crypto_kem/mceliece8192128f/clean/controlbits.c new file mode 100644 index 00000000..6e419584 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/controlbits.h b/crypto_kem/mceliece8192128f/clean/controlbits.h new file mode 100644 index 00000000..6194e9b1 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128F_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/crypto_hash.h b/crypto_kem/mceliece8192128f/clean/crypto_hash.h new file mode 100644 index 00000000..e0a36e9d --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128f/clean/decrypt.c b/crypto_kem/mceliece8192128f/clean/decrypt.c new file mode 100644 index 00000000..640ed0a1 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128F_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE8192128F_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE8192128F_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE8192128F_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE8192128F_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE8192128F_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE8192128F_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/crypto_kem/mceliece8192128f/clean/decrypt.h b/crypto_kem/mceliece8192128f/clean/decrypt.h new file mode 100644 index 00000000..e3ebdf93 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128F_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/encrypt.c b/crypto_kem/mceliece8192128f/clean/encrypt.c new file mode 100644 index 00000000..9595651c --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/encrypt.c @@ -0,0 +1,126 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include 
+#include +#include + +#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq; + + uint16_t ind[ SYS_T ]; + uint8_t *ind8 = (uint8_t *)ind; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + // Copy to uint16_t ind in a little-endian way + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE8192128F_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128f/clean/encrypt.h b/crypto_kem/mceliece8192128f/clean/encrypt.h new file mode 100644 index 00000000..5ac83efc --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128F_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/gf.c b/crypto_kem/mceliece8192128f/clean/gf.c new file mode 100644 index 00000000..4d83bb94 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/gf.c @@ -0,0 +1,210 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* input: field element in */ +/* return: (in^2)^2 */ +static inline gf gf_sq2(gf 
in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: (in^2)*m */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element in, m */ +/* return: ((in^2)^2)*m */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128F_CLEAN_gf_frac(in, ((gf) 1)); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE8192128F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(prod[i], (gf) 7682); + prod[i - SYS_T + 3] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(prod[i], (gf) 2159); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/gf.h b/crypto_kem/mceliece8192128f/clean/gf.h new file mode 100644 index 00000000..3e92cbb1 --- /dev/null +++ 
b/crypto_kem/mceliece8192128f/clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_GF_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE8192128F_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void PQCLEAN_MCELIECE8192128F_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/operations.c b/crypto_kem/mceliece8192128f/clean/operations.c new file mode 100644 index 00000000..6f3a460a --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128F_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128F_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128F_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128F_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128F_CLEAN_perm_check(perm)) { + continue; + } + + 
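+        /* the polynomial and the permutation both passed their checks; store the
+           Goppa polynomial coefficients into the secret key before attempting
+           public-key generation */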
for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128F_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128F_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128F_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/clean/params.h b/crypto_kem/mceliece8192128f/clean/params.h new file mode 100644 index 00000000..8b398a32 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/pk_gen.c b/crypto_kem/mceliece8192128f/clean/pk_gen.c new file mode 100644 index 00000000..701b0a8c --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/pk_gen.c @@ -0,0 +1,294 @@ +/* + This file is for public-key generation +*/ + +#include +#include +#include +#include + +#include "controlbits.h" +#include "benes.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint8_t mat[][ SYS_N / 8 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 8; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load8( &mat[ row + i ][ block_idx ] ); + } + + // compute the column indices of pivots by Gaussian elimination. 
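+    // on the extracted 32x64 block, locating each pivot with the branch-free ctz() above;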
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_CLEAN_load8( &mat[ i + j ][ block_idx ] ); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + PQCLEAN_MCELIECE8192128F_CLEAN_store8( &mat[ i + j ][ block_idx ], buf[j] ); + } + } + + return 0; +} + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE8192128F_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + unsigned char mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + unsigned char mask; + unsigned char b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE8192128F_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE8192128F_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE8192128F_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE8192128F_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= 
j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/clean/pk_gen.h b/crypto_kem/mceliece8192128f/clean/pk_gen.h new file mode 100644 index 00000000..f25b7799 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE8192128F_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/root.c b/crypto_kem/mceliece8192128f/clean/root.c new file mode 100644 index 00000000..3c1b8b0c --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE8192128F_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE8192128F_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE8192128F_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE8192128F_CLEAN_eval(f, L[i]); + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/root.h b/crypto_kem/mceliece8192128f/clean/root.h new file mode 100644 index 00000000..1d63d92b --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE8192128F_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE8192128F_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/sk_gen.c b/crypto_kem/mceliece8192128f/clean/sk_gen.c new file mode 100644 index 00000000..521602c7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128F_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128F_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + 
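+        /* fold column k into column j for as long as the pivot mat[j][j] is still zero;
+           the gf_iszero mask keeps this branch-free */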
for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128F_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128F_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128F_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128F_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/clean/sk_gen.h b/crypto_kem/mceliece8192128f/clean/sk_gen.h new file mode 100644 index 00000000..87a5640f --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE8192128F_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128F_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/synd.c b/crypto_kem/mceliece8192128f/clean/synd.c new file mode 100644 index 00000000..f9ab2685 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE8192128F_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE8192128F_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE8192128F_CLEAN_gf_inv(PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE8192128F_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE8192128F_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/synd.h b/crypto_kem/mceliece8192128f/clean/synd.h new file mode 100644 index 00000000..d84afef6 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_SYND_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE8192128F_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/transpose.c b/crypto_kem/mceliece8192128f/clean/transpose.c new file mode 100644 index 00000000..9e1351fc --- /dev/null +++ 
b/crypto_kem/mceliece8192128f/clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/crypto_kem/mceliece8192128f/clean/transpose.h b/crypto_kem/mceliece8192128f/clean/transpose.h new file mode 100644 index 00000000..737fe9a6 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE8192128F_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/clean/util.c b/crypto_kem/mceliece8192128f/clean/util.c new file mode 100644 index 00000000..fa4608d8 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE8192128F_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128F_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128F_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE8192128F_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128F_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE8192128F_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 3; +} + diff --git a/crypto_kem/mceliece8192128f/clean/util.h b/crypto_kem/mceliece8192128f/clean/util.h new file mode 100644 index 00000000..c81da211 --- /dev/null +++ b/crypto_kem/mceliece8192128f/clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE8192128F_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE8192128F_CLEAN_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" 
+#include + +void PQCLEAN_MCELIECE8192128F_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE8192128F_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE8192128F_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE8192128F_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE8192128F_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE8192128F_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/LICENSE b/crypto_kem/mceliece8192128f/sse/LICENSE new file mode 100644 index 00000000..eba3e7ce --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/crypto_kem/mceliece8192128f/sse/Makefile b/crypto_kem/mceliece8192128f/sse/Makefile new file mode 100644 index 00000000..6577efb1 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/Makefile @@ -0,0 +1,37 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB = libmceliece8192128f_sse.a + +SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ + fft_tr.c gf.c operations.c pk_gen.c sk_gen.c util.c vec128.c \ + consts.S syndrome_asm.S transpose_64x128_sp_asm.S update_asm.S \ + vec128_mul_asm.S vec_reduce_asm.S + +HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ + decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ + pk_gen.h sk_gen.h transpose.h util.h vec128.h \ + consts.inc scalars_2x.inc scalars_4x.inc powers.data + +OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ + fft_tr.o gf.o operations.o pk_gen.o sk_gen.o util.o vec128.o \ + consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ + update_asm.o vec128_mul_asm.o vec_reduce_asm.o + +CFLAGS = -O3 -std=c99 -mbmi -mpopcnt -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ + -Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ + -I../../../common/ $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S + $(CC) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/mceliece8192128f/sse/aes256ctr.c b/crypto_kem/mceliece8192128f/sse/aes256ctr.c new file mode 100644 index 00000000..bc25c9d8 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE8192128F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/crypto_kem/mceliece8192128f/sse/aes256ctr.h b/crypto_kem/mceliece8192128f/sse/aes256ctr.h new file mode 100644 index 00000000..7c0badba --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE8192128F_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void 
PQCLEAN_MCELIECE8192128F_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128f/sse/api.h b/crypto_kem/mceliece8192128f/sse/api.h new file mode 100644 index 00000000..7c07b7ec --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_API_H +#define PQCLEAN_MCELIECE8192128F_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_ALGNAME "Classic McEliece 8192128f" +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/benes.c b/crypto_kem/mceliece8192128f/sse/benes.c new file mode 100644 index 00000000..3b4583e8 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[0]); 
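+            /* conditional swap idiom: d = (a ^ b) & mask; a ^= d; b ^= d;
+               exchanges a and b exactly where the condition bits are set */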
+ bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + 
diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128F_SSE_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128F_SSE_load8(ptr), PQCLEAN_MCELIECE8192128F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128F_SSE_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128F_SSE_load8(ptr), PQCLEAN_MCELIECE8192128F_SSE_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128F_SSE_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( r ); + + 
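/*
 * Editor's note on the cascade below (derived from the code, not from the
 * upstream comments): it consumes the 25 blocks of control bits prepared by
 * PQCLEAN_MCELIECE8192128F_SSE_load_bits, one block of 32 128-bit vectors per
 * layer_* call, with b_ptr advancing by |inc| = 32 each time. With rev == 0
 * the blocks are used in order b[0] .. b[24]; with rev != 0 they are used in
 * reverse, which applies the inverse permutation, matching the
 * "0 for normal application; !0 for inverse" comment above.
 */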
layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp( r ); +} + diff --git a/crypto_kem/mceliece8192128f/sse/benes.h b/crypto_kem/mceliece8192128f/sse/benes.h new file mode 100644 index 00000000..fe2c79b1 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_BENES_H +#define PQCLEAN_MCELIECE8192128F_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128F_SSE_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128F_SSE_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/bm.c b/crypto_kem/mceliece8192128f/sse/bm.c new file mode 100644 index 00000000..d8a57e12 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/bm.c @@ -0,0 +1,208 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +extern gf PQCLEAN_MCELIECE8192128F_SSE_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128F_SSE_update_asm(vec128 *, gf); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 *out, vec128 *in, uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[i], m0); + v1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(out[i], m1); + out[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(v0, v1); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[idx1], mask[1])); + + 
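/*
 * Editor's note (stand-alone sketch, not from the patch): the helpers near the
 * top of bm.c above return an all-ones 16-bit mask when their condition holds
 * and zero otherwise, so the Berlekamp-Massey update can select values without
 * branching. Re-stating them, together with the scalar analogue of
 * vec128_cmov, plus a few spot checks:
 */
#include <assert.h>
#include <stdint.h>

static uint16_t nonzero_mask(uint16_t a) {          /* 0xFFFF iff a != 0 */
    uint32_t r = a;
    r -= 1;
    r >>= 31;
    r -= 1;
    return (uint16_t)r;
}

static uint16_t leq_mask(uint16_t a, uint16_t b) {   /* 0xFFFF iff a <= b */
    uint32_t r = (uint32_t)b - (uint32_t)a;
    r >>= 31;
    r -= 1;
    return (uint16_t)r;
}

/* Constant-time select: returns on_true when mask is all-ones, on_false when zero. */
static uint16_t select16(uint16_t on_true, uint16_t on_false, uint16_t mask) {
    return (uint16_t)((on_true & mask) | (on_false & (uint16_t)~mask));
}

static void spot_checks(void) {
    assert(nonzero_mask(0) == 0x0000);
    assert(nonzero_mask(7) == 0xFFFF);
    assert(leq_mask(3, 5) == 0xFFFF);
    assert(leq_mask(5, 5) == 0xFFFF);  /* equality counts as "less than or equal" */
    assert(leq_mask(6, 5) == 0x0000);
    assert(select16(0x1234, 0x5678, 0xFFFF) == 0x1234);
    assert(select16(0x1234, 0x5678, 0x0000) == 0x5678);
}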
in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128F_SSE_bm(vec128 *out, vec128 in[][ GFBITS ]) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + vec128 dd[ GFBITS ], bb[ GFBITS ]; + vec128 B[ GFBITS ], C[ GFBITS ]; + vec128 B_tmp[ GFBITS ], C_tmp[ GFBITS ]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[128], in[1]); + + C[0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(); + B[0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + C[i] = B[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(prod, C, (vec128 *) interval); + PQCLEAN_MCELIECE8192128F_SSE_update_asm(interval, coefs[N]); + d = 
PQCLEAN_MCELIECE8192128F_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128F_SSE_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits((d >> i) & 1); + bb[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(B_tmp, dd, B); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(C_tmp, bb, C); + + vec128_cmov(B, C, mask); + PQCLEAN_MCELIECE8192128F_SSE_update_asm(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(B_tmp[i], C_tmp[i]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128F_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(out, out, C); +} + diff --git a/crypto_kem/mceliece8192128f/sse/bm.h b/crypto_kem/mceliece8192128f/sse/bm.h new file mode 100644 index 00000000..b5f0816f --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/bm.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_BM_H +#define PQCLEAN_MCELIECE8192128F_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128F_SSE_bm(vec128 * /*out*/, vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/consts.S b/crypto_kem/mceliece8192128f/sse/consts.S new file mode 100644 index 00000000..9a16e5c7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK4_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE8192128F_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE8192128F_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128F_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128F_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128F_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128F_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128F_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128F_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128F_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128F_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128F_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128F_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128F_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/crypto_kem/mceliece8192128f/sse/consts.inc b/crypto_kem/mceliece8192128f/sse/consts.inc new file mode 100644 index 00000000..3cbb5885 --- 
/dev/null +++ b/crypto_kem/mceliece8192128f/sse/consts.inc @@ -0,0 +1,967 @@ +// 64 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +// 128 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), +}, +// 256 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669966969966996, 0X6996699696699669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), +}, +// 512 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699669966996699, 0X9966996699669966), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966996699669966, 0X9966996699669966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6666999999996666, 0X9999666666669999), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9999666666669999, 0X6666999999996666), +}, +// 1024 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996699669966996, 0X9669966996699669), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9669966996699669, 0X6996699669966996), +}, +// 2048 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6996966996696996, 0X6996966996696996), +}, +// 4096 +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +}, +{ + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), +} diff --git a/crypto_kem/mceliece8192128f/sse/controlbits.c b/crypto_kem/mceliece8192128f/sse/controlbits.c new file mode 100644 index 00000000..b639c5d4 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128F_SSE_sort_63b(int n, 
uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128F_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128F_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE8192128F_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/sse/controlbits.h b/crypto_kem/mceliece8192128f/sse/controlbits.h new file mode 100644 index 00000000..7a46a4ea --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128F_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128F_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128F_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/crypto_hash.h b/crypto_kem/mceliece8192128f/sse/crypto_hash.h new file mode 100644 index 00000000..1ecbb8bd --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128F_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128f/sse/decrypt.c b/crypto_kem/mceliece8192128f/sse/decrypt.c new file mode 100644 index 00000000..c1d6283a --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/decrypt.c @@ -0,0 +1,175 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + vec128 irr_int[ GFBITS ]; + vec128 eval[64][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE8192128F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128F_SSE_fft(eval, irr_int); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_inv(tmp, inv[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 64; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128F_SSE_load16(s + i * 16); + 
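/* For orientation: this loop, together with the broadcast of zero just above it,
   zero-extends the received syndrome to a full SYS_N-bit vector before the Benes
   network is applied. recv[0..63] are first all set to zero, then the first
   SYND_BYTES/16 words are overwritten with the ciphertext. For this parameter set
   (GFBITS = 13, SYS_T = 128, SYS_N = 8192), SYND_BYTES = 13 * 128 / 8 = 208, so
   recv[0..12] carry the syndrome and recv[13..63] stay zero. A rough scalar
   equivalent, assuming a hypothetical byte view recv_bytes of recv, would be:

       memset(recv_bytes, 0, SYS_N / 8);     // 1024 bytes, the full error-vector length
       memcpy(recv_bytes, s, SYND_BYTES);    // 208 bytes of syndrome, rest stays zero
*/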
} +} + +static uint16_t weight(vec128 *v) { + uint16_t i, w = 0; + + for (i = 0; i < 64; i++) { + w += (uint16_t)_mm_popcnt_u64(PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(v[i], 0) ); + w += (uint16_t)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(v[i], 1) ); + } + + return w; +} + +static uint16_t synd_cmp(vec128 s0[][ GFBITS ], vec128 s1[][ GFBITS ]) { + int i, j; + vec128 diff; + + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(s0[0][0], s1[0][0]), + PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(s0[1][0], s1[1][0])); + + for (i = 0; i < 2; i++) { + for (j = 1; j < GFBITS; j++) { + diff = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(diff, PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(s0[i][j], s1[i][j])); + } + } + + return (uint16_t)PQCLEAN_MCELIECE8192128F_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128F_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 64 ][ GFBITS ]; + vec128 scaled[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + + vec128 error[ 64 ]; + + vec128 s_priv[ 2 ][ GFBITS ]; + vec128 s_priv_cmp[ 2 ][ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv[ 64 ]; + vec128 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE8192128F_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE8192128F_SSE_benes(recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE8192128F_SSE_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE8192128F_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE8192128F_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits(1); + + for (i = 0; i < 64; i++) { + error[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(error[i], allone); + } + + check_weight = weight(error) ^ SYS_T; + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE8192128F_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE8192128F_SSE_benes(error, bits_int, 0); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_SSE_store16(e + i * 16, error[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece8192128f/sse/decrypt.h b/crypto_kem/mceliece8192128f/sse/decrypt.h new file mode 100644 index 00000000..6aba1717 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE8192128F_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128F_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/encrypt.c b/crypto_kem/mceliece8192128f/sse/encrypt.c new file mode 100644 index 00000000..f9ac9742 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/encrypt.c @@ -0,0 +1,84 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void 
PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq; + + uint16_t ind[ SYS_T ]; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind, sizeof(ind)); + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128F_SSE_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128F_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128f/sse/encrypt.h b/crypto_kem/mceliece8192128f/sse/encrypt.h new file mode 100644 index 00000000..a2a937c5 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128F_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128F_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/fft.c b/crypto_kem/mceliece8192128f/sse/fft.c new file mode 100644 index 00000000..41bae8a3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/fft.c @@ -0,0 +1,243 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec128.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i], 
1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(in, in, s[j]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec128 t[ GFBITS ]; + vec128 pre[8][ GFBITS ]; + vec128 buf[64]; + + uint64_t v0, v1; + uint64_t consts_ptr = 1; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec128 powers[ 64 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre[i + 0][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(tmp[j], tmp[j]); + pre[i + 1][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i], 0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i], 0) ^ PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(pre[6][i], 0)); + + buf[1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], pre[0][i]); + buf[16] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], pre[4][i]); + buf[3] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[1], pre[1][i]); + buf[48] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[16], pre[5][i]); + buf[49] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[48], pre[0][i]); + buf[2] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], pre[1][i]); + buf[51] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[49], pre[1][i]); + buf[6] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[2], pre[2][i]); + buf[50] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[51], pre[0][i]); + buf[7] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[6], pre[0][i]); + buf[54] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[50], pre[2][i]); + buf[5] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[7], pre[1][i]); + buf[55] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[54], pre[0][i]); + buf[53] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[55], pre[1][i]); + buf[4] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], 
pre[2][i]); + buf[52] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[53], pre[0][i]); + buf[12] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[4], pre[3][i]); + buf[60] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[52], pre[3][i]); + buf[13] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[12], pre[0][i]); + buf[61] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[60], pre[0][i]); + buf[15] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[13], pre[1][i]); + buf[63] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[61], pre[1][i]); + buf[14] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[15], pre[0][i]); + buf[62] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[63], pre[0][i]); + buf[10] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[14], pre[2][i]); + buf[58] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[62], pre[2][i]); + buf[11] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[10], pre[0][i]); + buf[59] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[58], pre[0][i]); + buf[9] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[11], pre[1][i]); + buf[57] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[59], pre[1][i]); + buf[56] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[57], pre[0][i]); + buf[8] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], pre[3][i]); + buf[40] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[56], pre[4][i]); + buf[24] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[8], pre[4][i]); + buf[41] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[40], pre[0][i]); + buf[25] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[24], pre[0][i]); + buf[43] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[41], pre[1][i]); + buf[27] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[25], pre[1][i]); + buf[42] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[43], pre[0][i]); + buf[26] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[27], pre[0][i]); + buf[46] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[42], pre[2][i]); + buf[30] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[26], pre[2][i]); + buf[47] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[46], pre[0][i]); + buf[31] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[30], pre[0][i]); + buf[45] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[47], pre[1][i]); + buf[29] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[31], pre[1][i]); + buf[44] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[45], pre[0][i]); + buf[28] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[29], pre[0][i]); + buf[36] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[44], pre[3][i]); + buf[20] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[28], pre[3][i]); + buf[37] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[36], pre[0][i]); + buf[21] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[20], pre[0][i]); + buf[39] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[37], pre[1][i]); + buf[23] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[21], pre[1][i]); + buf[38] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[39], pre[0][i]); + buf[22] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[23], pre[0][i]); + buf[34] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[38], pre[2][i]); + buf[18] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[22], pre[2][i]); + buf[35] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[34], pre[0][i]); + buf[19] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[18], pre[0][i]); + buf[33] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[35], pre[1][i]); + buf[17] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[19], pre[1][i]); + buf[32] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[33], pre[0][i]); + + 
PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp(buf); + + for (j = 0; j < 64; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 0; i <= 5; i++) { + s = 1 << i; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, out[k + s], (vec128 *) consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(out[k ][b], tmp[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(out[k + s][b], out[k][b]); + } + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 64; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128F_SSE_fft(vec128 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/crypto_kem/mceliece8192128f/sse/fft.h b/crypto_kem/mceliece8192128f/sse/fft.h new file mode 100644 index 00000000..dc46b5ef --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_FFT_H +#define PQCLEAN_MCELIECE8192128F_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include + +void PQCLEAN_MCELIECE8192128F_SSE_fft(vec128 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/fft_tr.c b/crypto_kem/mceliece8192128f/sse/fft_tr.c new file mode 100644 index 00000000..4c7448ae --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/fft_tr.c @@ -0,0 +1,338 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec128 in[][ GFBITS ]) { + int i, j, k; + vec128 t, x0, x1; + + uint64_t v0, v1, v2, v3; + + const vec128 mask[6][2] = { + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec128 s[6][2][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(in[0], in[0], s[j][0]); // scaling + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(in[1], in[1], s[j][1]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[0][i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[0][i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(t, 1 << k); + in[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[0][i], t); + + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[1][i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[1][i], t); + + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_and(in[1][i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(t, 1 << k); + in[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[1][i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + x0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(in[0][i], in[1][i]); + x1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(in[0][i], in[1][i]); + + x1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(x0, 32)); + x1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(x1, PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(x1, 32)); + + in[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(x0, x1); + in[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(x0, x1); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[0][i], 0); + v1 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[0][i], 1); + v2 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[1][i], 0); + v3 = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[1][i], 1); + + v3 ^= v2 ^= v1; + + in[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(v0, v1); + in[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(v2, v3); + } + + } +} + +static void butterflies_tr(vec128 out[][ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec128 tmp0[ GFBITS ]; + vec128 tmp1[ GFBITS ]; + vec128 tmp[ GFBITS ]; + + vec128 pre[ 6 ][ GFBITS ]; + vec128 buf[ 64 ]; + + const vec128 consts[ 64 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 64; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 5; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 64; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[k][b], in[k + s][b]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[k + s][b], tmp[b]); + } + } + } + } + + for (j = 0; j < 64; j += 2) { + for (i = 0; i < GFBITS; i++) { + tmp0[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(in[j][i], in[j + 1][i]); + tmp1[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(in[j][i], in[j + 1][i]); + } + + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(tmp0[b], tmp1[b]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, tmp0, 
consts[0]); + + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(tmp1[b], tmp[b]); + } + + for (i = 0; i < GFBITS; i++) { + in[j + 0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(tmp0[i], tmp1[i]); + in[j + 1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(tmp0[i], tmp1[i]); + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 64; k++) { + buf[k] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE8192128F_SSE_transpose_64x128_sp(buf); + + pre[0][i] = buf[32]; + buf[33] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[33], buf[32]); + pre[1][i] = buf[33]; + buf[35] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[35], buf[33]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[35]); + buf[34] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[34], buf[35]); + pre[2][i] = buf[34]; + buf[38] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[38], buf[34]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[38]); + buf[39] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[39], buf[38]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[39]); + buf[37] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[37], buf[39]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[37]); + buf[36] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[36], buf[37]); + pre[3][i] = buf[36]; + buf[44] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[44], buf[36]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[44]); + buf[45] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[45], buf[44]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[45]); + buf[47] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[47], buf[45]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[47]); + buf[46] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[46], buf[47]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[46]); + buf[42] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[42], buf[46]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[42]); + buf[43] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[43], buf[42]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[43]); + buf[41] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[41], buf[43]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[41]); + buf[40] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[40], buf[41]); + pre[4][i] = buf[40]; + buf[56] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[56], buf[40]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[56]); + buf[57] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[57], buf[56]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[57]); + buf[59] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[59], buf[57]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[59]); + buf[58] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[58], buf[59]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[58]); + buf[62] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[62], buf[58]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[62]); + buf[63] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[63], buf[62]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[63]); + buf[61] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[61], buf[63]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[61]); + buf[60] = 
PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[60], buf[61]); + pre[3][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[3][i], buf[60]); + buf[52] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[52], buf[60]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[52]); + buf[53] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[53], buf[52]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[53]); + buf[55] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[55], buf[53]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[55]); + buf[54] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[54], buf[55]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[54]); + buf[50] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[50], buf[54]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[50]); + buf[51] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[51], buf[50]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[51]); + buf[49] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[49], buf[51]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[49]); + buf[48] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[48], buf[49]); + pre[5][i] = buf[48]; + buf[16] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[16], buf[48]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[16]); + buf[17] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[17], buf[16]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[17]); + buf[19] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[19], buf[17]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[19]); + buf[18] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[18], buf[19]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[18]); + buf[22] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[22], buf[18]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[22]); + buf[23] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[23], buf[22]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[23]); + buf[21] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[21], buf[23]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[21]); + buf[20] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[20], buf[21]); + pre[3][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[3][i], buf[20]); + buf[28] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[28], buf[20]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[28]); + buf[29] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[29], buf[28]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[29]); + buf[31] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[31], buf[29]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[31]); + buf[30] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[30], buf[31]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[30]); + buf[26] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[26], buf[30]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[26]); + buf[27] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[27], buf[26]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[27]); + buf[25] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[25], buf[27]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[25]); + buf[24] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[24], buf[25]); + pre[4][i] = 
PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[4][i], buf[24]); + buf[8] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[8], buf[24]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[8]); + buf[9] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[9], buf[8]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[9]); + buf[11] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[11], buf[9]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[11]); + buf[10] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[10], buf[11]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[10]); + buf[14] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[14], buf[10]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[14]); + buf[15] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[15], buf[14]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[15]); + buf[13] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[13], buf[15]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[13]); + buf[12] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[12], buf[13]); + pre[3][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[3][i], buf[12]); + buf[4] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[4], buf[12]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[4]); + buf[5] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[5], buf[4]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[5]); + buf[7] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[7], buf[5]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[7]); + buf[6] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[6], buf[7]); + pre[2][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[2][i], buf[6]); + buf[2] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[2], buf[6]); + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[2]); + buf[3] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[3], buf[2]); + pre[1][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[1][i], buf[3]); + buf[1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[1], buf[3]); + + pre[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(pre[0][i], buf[1]); + out[0][i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(buf[0], buf[1]); + + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(out[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, pre[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out[1][b] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(out[1][b], tmp[b]); + } + } +} + +void PQCLEAN_MCELIECE8192128F_SSE_fft_tr(vec128 out[][GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128f/sse/fft_tr.h b/crypto_kem/mceliece8192128f/sse/fft_tr.h new file mode 100644 index 00000000..1f94e182 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE8192128F_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128F_SSE_fft_tr(vec128 /*out*/[][GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/gf.c 
b/crypto_kem/mceliece8192128f/sse/gf.c new file mode 100644 index 00000000..f7ca22d9 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128F_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128F_SSE_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out 
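/*
 * Editorial sketch (not part of the upstream Classic McEliece sources): the
 * reduction steps above, tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13),
 * fold bits 13 and up modulo the field polynomial z^13 + z^4 + z^3 + z + 1,
 * and gf_frac/gf_inv raise den to 2^13 - 2 through a short addition chain.
 * A plain bit-serial reference multiplication, useful only for cross-checking
 * PQCLEAN_MCELIECE8192128F_SSE_gf_mul in tests (gf13_mul_ref is a
 * hypothetical helper, not part of this patch):
 *
 *     static uint16_t gf13_mul_ref(uint16_t a, uint16_t b) {
 *         uint32_t acc = 0;
 *         int i;
 *         for (i = 0; i < 13; i++) {
 *             if ((b >> i) & 1) {
 *                 acc ^= (uint32_t)a << i;      // schoolbook carry-less product
 *             }
 *         }
 *         for (i = 24; i >= 13; i--) {
 *             if ((acc >> i) & 1) {
 *                 acc ^= 0x201BUL << (i - 13);  // z^13 + z^4 + z^3 + z + 1
 *             }
 *         }
 *         return (uint16_t)acc;                 // already below 2^13
 *     }
 */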
= gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128F_SSE_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128F_SSE_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128F_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128F_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128F_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128F_SSE_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128F_SSE_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128F_SSE_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/sse/gf.h b/crypto_kem/mceliece8192128f/sse/gf.h new file mode 100644 index 00000000..c698c95e --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_GF_H +#define PQCLEAN_MCELIECE8192128F_SSE_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128F_SSE_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128F_SSE_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128F_SSE_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128F_SSE_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128F_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_SSE_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/operations.c b/crypto_kem/mceliece8192128f/sse/operations.c new file mode 100644 index 00000000..bf390a94 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128F_SSE_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128F_SSE_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + 
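/*
 * Editorial note with a small illustration (not part of the upstream sources):
 * the loop that follows ORs the byte-wise differences between the recomputed
 * confirmation hash and the one stored in the ciphertext into ret_confirm, so
 * there is no early exit that would leak where a mismatch occurs. Afterwards
 *     m = ret_decrypt | ret_confirm;  m -= 1;  m >>= 8;
 * yields m == 0xFF exactly when both checks passed (0 - 1 underflows to
 * 0xFFFF) and m == 0x00 otherwise, and (~m & sk[i]) | (m & e[i]) then selects
 * the recovered error vector on success or secret-key bytes on failure
 * without branching. A standalone sketch of that selection idiom (ct_select
 * is a hypothetical helper):
 *
 *     static uint8_t ct_select(uint8_t mask, uint8_t on_success, uint8_t on_failure) {
 *         return (uint8_t)((mask & on_success) | ((uint8_t)~mask & on_failure));
 *     }
 */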
ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128F_SSE_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128F_SSE_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128F_SSE_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128F_SSE_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128F_SSE_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128F_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128F_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128F_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/sse/params.h b/crypto_kem/mceliece8192128f/sse/params.h new file mode 100644 index 00000000..80e3101b --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_PARAMS_H +#define PQCLEAN_MCELIECE8192128F_SSE_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/pk_gen.c b/crypto_kem/mceliece8192128f/sse/pk_gen.c new file mode 100644 index 00000000..3201854f --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/pk_gen.c @@ -0,0 +1,342 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 128 + 0 * 64 + r] <<= 1; + out[i * 128 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 128 + 1 * 64 + r] <<= 1; + out[i * 128 + 1 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[2] = {0}; + + for (i = 0; i < 64; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(u[0], u[1]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 2; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(u[0], u[1]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ 128 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
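/*
 * Editorial sketch (not part of the upstream sources): this column-moving step
 * is what the "f" (fast key-generation) parameter sets add on top of plain
 * Gaussian elimination. For each of the last 32 rows, ctz() of the OR of the
 * remaining rows gives the lowest usable pivot column; rows below are
 * conditionally folded in until that bit is present, the bit is then
 * eliminated from every other row, and the chosen columns are moved into
 * place in both mat[][] and perm[] using only arithmetic masks (same_mask),
 * so the memory-access pattern does not depend on the secret key. The column
 * moves reduce to a branch-free conditional swap of two words, e.g. (cswap64
 * is an illustrative helper, not in this file):
 *
 *     static void cswap64(uint64_t *a, uint64_t *b, uint64_t mask) {
 *         uint64_t d = (*a ^ *b) & mask;   // mask is 0 or all ones
 *         *a ^= d;
 *         *b ^= d;
 *     }
 */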
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + + +#define NBLOCKS_H ((SYS_N + 127) / 128) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE8192128F_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + + uint64_t mask; + + vec128 irr_int[ GFBITS ]; + + vec128 consts[ 64 ][ GFBITS ]; + vec128 eval[ 64 ][ GFBITS ]; + vec128 prod[ 64 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128F_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128F_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_inv(tmp, prod[63]); + + for (i = 62; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128F_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS_H; j++) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 
1 ] = PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < 128; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < 128; j++) { + PQCLEAN_MCELIECE8192128F_SSE_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/sse/pk_gen.h b/crypto_kem/mceliece8192128f/sse/pk_gen.h new file mode 100644 index 00000000..2acc1ef3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE8192128F_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128F_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/powers.inc b/crypto_kem/mceliece8192128f/sse/powers.inc new file mode 100644 index 00000000..7048cbb9 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/powers.inc @@ -0,0 +1,960 @@ +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/crypto_kem/mceliece8192128f/sse/scalars_2x.inc b/crypto_kem/mceliece8192128f/sse/scalars_2x.inc new file mode 100644 index 00000000..f7f45386 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 
0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/crypto_kem/mceliece8192128f/sse/scalars_4x.inc b/crypto_kem/mceliece8192128f/sse/scalars_4x.inc new file mode 100644 index 00000000..6d31f3f3 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/scalars_4x.inc @@ -0,0 +1,181 @@ +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X3CCC03FCCC3FFC03, 
0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF), +}}, +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FF00000000FF, 0X0000FF000000FFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF), +}}, +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), +}}, +{{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000), +}, +{ + PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0X0000000000000000),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF),
+}}
+
diff --git a/crypto_kem/mceliece8192128f/sse/sk_gen.c b/crypto_kem/mceliece8192128f/sse/sk_gen.c
new file mode 100644
index 00000000..2e773826
--- /dev/null
+++ b/crypto_kem/mceliece8192128f/sse/sk_gen.c
@@ -0,0 +1,98 @@
+/*
+  This file is for secret-key generation
+*/
+
+#include "sk_gen.h"
+
+#include "controlbits.h"
+#include "gf.h"
+#include "params.h"
+#include "util.h"
+
+/* input: f, element in GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE8192128F_SSE_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE8192128F_SSE_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // Gaussian elimination
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE8192128F_SSE_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE8192128F_SSE_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE8192128F_SSE_gf_mul(mat[ c ][ j ], inv);
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128F_SSE_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/*          0 otherwise */
+int PQCLEAN_MCELIECE8192128F_SSE_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE8192128F_SSE_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/crypto_kem/mceliece8192128f/sse/sk_gen.h b/crypto_kem/mceliece8192128f/sse/sk_gen.h
new file mode 100644
index 00000000..3c86665b
--- /dev/null
+++ b/crypto_kem/mceliece8192128f/sse/sk_gen.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_MCELIECE8192128F_SSE_SK_GEN_H
+#define PQCLEAN_MCELIECE8192128F_SSE_SK_GEN_H
+/*
+  This file is for secret-key generation
+*/
+
+#include "gf.h" + +#include + +int PQCLEAN_MCELIECE8192128F_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128F_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/sse/syndrome_asm.S b/crypto_kem/mceliece8192128f/sse/syndrome_asm.S new file mode 100644 index 00000000..0b53f566 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/syndrome_asm.S @@ -0,0 +1,1449 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg128 p + +# qhasm: reg128 e + +# qhasm: reg128 s + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm +.global PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm +_PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm: +PQCLEAN_MCELIECE8192128F_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,s=reg128#1 +# asm 2: movdqu 0(s=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: e = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(e=reg128#2 +# asm 2: movdqu 208(e=%xmm1 +movdqu 208(%rdx),%xmm1 + +# qhasm: s &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 16(p=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(e=reg128#3 +# asm 2: movdqu 224(e=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 32(p=%xmm1 +movdqu 32(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(e=reg128#3 +# asm 2: movdqu 240(e=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 48(p=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(e=reg128#3 +# asm 2: movdqu 256(e=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 64(p=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(e=reg128#3 +# asm 2: movdqu 272(e=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 80(p=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(e=reg128#3 +# asm 2: movdqu 288(e=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 96(p=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(e=reg128#3 +# asm 2: movdqu 304(e=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 112(p=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(e=reg128#3 +# asm 2: movdqu 320(e=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 128(p=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 336 ] +# asm 1: movdqu 
336(e=reg128#3 +# asm 2: movdqu 336(e=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 144(p=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(e=reg128#3 +# asm 2: movdqu 352(e=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 160(p=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(e=reg128#3 +# asm 2: movdqu 368(e=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 176(p=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(e=reg128#3 +# asm 2: movdqu 384(e=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 192(p=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(e=reg128#3 +# asm 2: movdqu 400(e=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 208(p=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(e=reg128#3 +# asm 2: movdqu 416(e=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 224(p=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 432 ] +# asm 1: movdqu 432(e=reg128#3 +# asm 2: movdqu 432(e=%xmm2 +movdqu 432(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 240(p=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 448 ] +# asm 1: movdqu 448(e=reg128#3 +# asm 2: movdqu 448(e=%xmm2 +movdqu 448(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 256(p=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 464 ] +# asm 1: movdqu 464(e=reg128#3 +# asm 2: movdqu 464(e=%xmm2 +movdqu 464(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 272(p=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 480 ] +# asm 1: movdqu 480(e=reg128#3 +# asm 2: movdqu 480(e=%xmm2 +movdqu 480(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 288(p=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 496 ] +# asm 1: movdqu 496(e=reg128#3 +# asm 2: movdqu 496(e=%xmm2 +movdqu 496(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 304(p=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 512 ] +# asm 1: movdqu 512(e=reg128#3 +# asm 2: movdqu 512(e=%xmm2 +movdqu 512(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 320(p=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 528 ] +# asm 1: movdqu 528(e=reg128#3 +# asm 2: movdqu 528(e=%xmm2 +movdqu 528(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 336(p=%xmm1 +movdqu 336(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 544 ] +# asm 1: movdqu 544(e=reg128#3 +# asm 2: movdqu 544(e=%xmm2 +movdqu 544(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 352(p=%xmm1 +movdqu 352(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 560 ] +# asm 1: movdqu 560(e=reg128#3 +# asm 2: movdqu 560(e=%xmm2 +movdqu 560(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 368(p=%xmm1 +movdqu 368(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 576 ] +# asm 1: movdqu 576(e=reg128#3 +# asm 2: movdqu 576(e=%xmm2 +movdqu 576(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 384(p=%xmm1 +movdqu 384(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 592 ] +# 
asm 1: movdqu 592(e=reg128#3 +# asm 2: movdqu 592(e=%xmm2 +movdqu 592(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 400(p=%xmm1 +movdqu 400(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 608 ] +# asm 1: movdqu 608(e=reg128#3 +# asm 2: movdqu 608(e=%xmm2 +movdqu 608(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 416(p=%xmm1 +movdqu 416(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 624 ] +# asm 1: movdqu 624(e=reg128#3 +# asm 2: movdqu 624(e=%xmm2 +movdqu 624(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 432(p=%xmm1 +movdqu 432(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 640 ] +# asm 1: movdqu 640(e=reg128#3 +# asm 2: movdqu 640(e=%xmm2 +movdqu 640(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 448(p=%xmm1 +movdqu 448(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 656 ] +# asm 1: movdqu 656(e=reg128#3 +# asm 2: movdqu 656(e=%xmm2 +movdqu 656(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 464(p=%xmm1 +movdqu 464(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 672 ] +# asm 1: movdqu 672(e=reg128#3 +# asm 2: movdqu 672(e=%xmm2 +movdqu 672(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 480(p=%xmm1 +movdqu 480(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 688 ] +# asm 1: movdqu 688(e=reg128#3 +# asm 2: movdqu 688(e=%xmm2 +movdqu 688(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 496(p=%xmm1 +movdqu 496(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 704 ] +# asm 1: movdqu 704(e=reg128#3 +# asm 2: movdqu 704(e=%xmm2 +movdqu 704(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 512(p=%xmm1 +movdqu 512(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 720 ] +# asm 1: movdqu 720(e=reg128#3 +# asm 2: movdqu 720(e=%xmm2 +movdqu 720(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 528(p=%xmm1 +movdqu 528(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 736 ] +# asm 1: movdqu 736(e=reg128#3 +# asm 2: movdqu 736(e=%xmm2 +movdqu 736(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 544(p=%xmm1 +movdqu 544(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 752 ] +# asm 1: movdqu 752(e=reg128#3 +# asm 2: movdqu 752(e=%xmm2 +movdqu 752(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 560(p=%xmm1 +movdqu 560(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 768 ] +# asm 1: movdqu 768(e=reg128#3 +# asm 2: movdqu 768(e=%xmm2 +movdqu 768(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 576(p=%xmm1 +movdqu 576(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 784 ] +# asm 1: movdqu 784(e=reg128#3 +# asm 2: movdqu 784(e=%xmm2 +movdqu 784(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 592(p=%xmm1 +movdqu 592(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 800 ] +# asm 1: movdqu 800(e=reg128#3 +# asm 2: movdqu 800(e=%xmm2 +movdqu 800(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 608(p=%xmm1 +movdqu 608(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 816 ] +# asm 1: movdqu 816(e=reg128#3 +# asm 2: movdqu 816(e=%xmm2 +movdqu 816(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 624(p=%xmm1 +movdqu 624(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 832 ] +# asm 1: movdqu 832(e=reg128#3 +# asm 2: movdqu 832(e=%xmm2 +movdqu 832(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 640(p=%xmm1 +movdqu 640(%rsi),%xmm1 + +# qhasm: e = mem128[ 
input_2 + 848 ] +# asm 1: movdqu 848(e=reg128#3 +# asm 2: movdqu 848(e=%xmm2 +movdqu 848(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 656(p=%xmm1 +movdqu 656(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 864 ] +# asm 1: movdqu 864(e=reg128#3 +# asm 2: movdqu 864(e=%xmm2 +movdqu 864(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 672(p=%xmm1 +movdqu 672(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 880 ] +# asm 1: movdqu 880(e=reg128#3 +# asm 2: movdqu 880(e=%xmm2 +movdqu 880(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 688(p=%xmm1 +movdqu 688(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 896 ] +# asm 1: movdqu 896(e=reg128#3 +# asm 2: movdqu 896(e=%xmm2 +movdqu 896(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 704(p=%xmm1 +movdqu 704(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 912 ] +# asm 1: movdqu 912(e=reg128#3 +# asm 2: movdqu 912(e=%xmm2 +movdqu 912(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 720(p=%xmm1 +movdqu 720(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 928 ] +# asm 1: movdqu 928(e=reg128#3 +# asm 2: movdqu 928(e=%xmm2 +movdqu 928(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 736(p=%xmm1 +movdqu 736(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 944 ] +# asm 1: movdqu 944(e=reg128#3 +# asm 2: movdqu 944(e=%xmm2 +movdqu 944(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 752(p=%xmm1 +movdqu 752(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 960 ] +# asm 1: movdqu 960(e=reg128#3 +# asm 2: movdqu 960(e=%xmm2 +movdqu 960(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 768(p=%xmm1 +movdqu 768(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 976 ] +# asm 1: movdqu 976(e=reg128#3 +# asm 2: movdqu 976(e=%xmm2 +movdqu 976(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 784(p=%xmm1 +movdqu 784(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 992 ] +# asm 1: movdqu 992(e=reg128#3 +# asm 2: movdqu 992(e=%xmm2 +movdqu 992(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand p=reg128#2 +# asm 2: movdqu 800(p=%xmm1 +movdqu 800(%rsi),%xmm1 + +# qhasm: e = mem128[ input_2 + 1008 ] +# asm 1: movdqu 1008(e=reg128#3 +# asm 2: movdqu 1008(e=%xmm2 +movdqu 1008(%rdx),%xmm2 + +# qhasm: p &= e +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#6 +# asm 2: movq 0(b64=%r9 +movq 0(%rcx),%r9 + +# qhasm: c_all = count(b64) +# asm 1: popcnt c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 8 ] +# asm 1: movq 8(b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,s=reg128#1 +# asm 2: movdqu 0(s=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(e=reg128#2 +# asm 2: movdqu 0(e=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 16(s=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(e=reg128#2 +# asm 2: movdqu 16(e=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: 
movdqu 32(s=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(e=reg128#2 +# asm 2: movdqu 32(e=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 48(s=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(e=reg128#2 +# asm 2: movdqu 48(e=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 64(s=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(e=reg128#2 +# asm 2: movdqu 64(e=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 80(s=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(e=reg128#2 +# asm 2: movdqu 80(e=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 96(s=%xmm0 +movdqu 96(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(e=reg128#2 +# asm 2: movdqu 96(e=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 112(s=%xmm0 +movdqu 112(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(e=reg128#2 +# asm 2: movdqu 112(e=%xmm1 +movdqu 112(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 128(s=%xmm0 +movdqu 128(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(e=reg128#2 +# asm 2: movdqu 128(e=%xmm1 +movdqu 128(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 144(s=%xmm0 +movdqu 144(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(e=reg128#2 +# asm 2: movdqu 144(e=%xmm1 +movdqu 144(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 160(s=%xmm0 +movdqu 160(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(e=reg128#2 +# asm 2: movdqu 160(e=%xmm1 +movdqu 160(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 176(s=%xmm0 +movdqu 176(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(e=reg128#2 +# asm 2: movdqu 176(e=%xmm1 +movdqu 176(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor s=reg128#1 +# asm 2: movdqu 192(s=%xmm0 +movdqu 192(%rdi),%xmm0 + +# qhasm: e = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(e=reg128#2 +# asm 2: movdqu 192(e=%xmm1 +movdqu 192(%rdx),%xmm1 + +# qhasm: s ^= e +# asm 1: pxor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa 
PQCLEAN_MCELIECE8192128F_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw 
$8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + 
+# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# 
qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 
unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# 
asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# 
asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 
304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# 
qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 
= x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: 
vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: 
vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw 
$8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & 
mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 
16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: 
vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand 
v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld 
$16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK2_1(%rip),>mask1=%xmm1 
+movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128F_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128F_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: 
vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# 
asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 
176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand 
%xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor 
x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand 
v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | 
v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ 
input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# 
qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 
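# ----------------------------------------------------------------------
# The masked shift-and-OR rounds in this routine follow the standard
# bit-matrix transpose pattern: each round pairs row x_j with row
# x_{j+s} and swaps complementary bit groups at distance s.  Assuming
# mask0..mask5 were loaded earlier with the usual nibble/pair/bit masks
# (0x0f.., 0x33.., 0x55.. and their complements), the distance-4 round
# amounts to the following C-like sketch:
#
#     new_xj  = (xj & mask0) | ((xjs & mask0) << 4);
#     new_xjs = ((xj & mask1) >> 4) | (xjs & mask1);
#
# The distance-4, -2 and -1 rounds repeated here appear to be the final
# steps of that transpose, applied to successive 128-byte blocks of
# input_0.
# ----------------------------------------------------------------------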
+ +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand 
v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq 
$2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq 
$1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = 
x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: 
x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand 
%xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor 
x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand 
v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#3 +# asm 2: movq 0(s0=%rdx +movq 0(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#4 +# asm 2: movq 8(s1=%rcx +movq 8(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 16(s0=%rdx +movq 16(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(s1=int64#4 +# asm 2: movq 24(s1=%rcx +movq 24(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 32(s0=%rdx +movq 32(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(s1=int64#4 +# asm 2: movq 40(s1=%rcx +movq 40(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 48(s0=%rdx +movq 48(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(s1=int64#4 +# asm 2: movq 56(s1=%rcx +movq 56(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 
2: movq 64(s0=%rdx +movq 64(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(s1=int64#4 +# asm 2: movq 72(s1=%rcx +movq 72(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 80(s0=%rdx +movq 80(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(s1=int64#4 +# asm 2: movq 88(s1=%rcx +movq 88(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 96(s0=%rdx +movq 96(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(s1=int64#4 +# asm 2: movq 104(s1=%rcx +movq 104(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 112(s0=%rdx +movq 112(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(s1=int64#4 +# asm 2: movq 120(s1=%rcx +movq 120(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 128(s0=%rdx +movq 128(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(s1=int64#4 +# asm 2: movq 136(s1=%rcx +movq 136(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 144(s0=%rdx +movq 144(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(s1=int64#4 +# asm 2: movq 152(s1=%rcx +movq 152(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 160(s0=%rdx +movq 160(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(s1=int64#4 +# asm 2: movq 168(s1=%rcx +movq 168(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 176(s0=%rdx +movq 176(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(s1=int64#4 +# asm 2: movq 184(s1=%rcx +movq 184(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#3 +# asm 2: movq 192(s0=%rdx +movq 192(%rdi),%rdx + +# qhasm: s1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(s1=int64#4 +# asm 2: movq 200(s1=%rcx +movq 200(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE8192128F_SSE_store_i(unsigned char *out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128F_SSE_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128F_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128F_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128F_SSE_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128F_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(v0, v1); + } +} + +void 
PQCLEAN_MCELIECE8192128F_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128F_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x( PQCLEAN_MCELIECE8192128F_SSE_load8(in), PQCLEAN_MCELIECE8192128F_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128F_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128F_SSE_store8(out + 0, PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128F_SSE_store8(out + 8, PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(in, 1)); +} diff --git a/crypto_kem/mceliece8192128f/sse/util.h b/crypto_kem/mceliece8192128f/sse/util.h new file mode 100644 index 00000000..eacc4198 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_UTIL_H +#define PQCLEAN_MCELIECE8192128F_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128F_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128F_SSE_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128F_SSE_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128F_SSE_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128F_SSE_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128F_SSE_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128F_SSE_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128F_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128F_SSE_store16(unsigned char *out, vec128 in); + +#endif diff --git a/crypto_kem/mceliece8192128f/sse/vec128.c b/crypto_kem/mceliece8192128f/sse/vec128.c new file mode 100644 index 00000000..2ba0c02f --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/vec128.c @@ -0,0 +1,152 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE8192128F_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, 
b); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128F_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS], t; + + t = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[11], in[12]); + + result[0] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[0], in[11]); + result[1] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[7], t); + result[2] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[1], in[7]); + result[3] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[8], t); + result[4] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[2], in[7]); + result[4] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[4], in[8]); + result[4] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[4], t); + result[5] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[7], in[9]); + result[6] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[3], in[8]); + result[6] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[6], in[9]); + result[6] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[6], in[12]); + result[7] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[8], in[10]); + result[8] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[4], in[9]); + result[8] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[8], in[10]); + result[9] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[9], in[11]); + result[10] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[5], in[10]); + result[10] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(result[10], in[11]); + result[11] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[10], in[12]); + result[12] = PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(in[6], t); + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE8192128F_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + 
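    /* the four squarings in this stretch (two above, two below) shift the
       ^11111111 exponent left by four bits, so the multiply by tmp_1111
       (= in^0b1111) then yields ^111111111111; the final squaring gives
       ^1111111111110 = in^(2^13 - 2), i.e. the field inverse in GF(2^13)
       by Fermat's little theorem, since GFBITS == 13 for this parameter
       set. */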
PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece8192128f/sse/vec128.h b/crypto_kem/mceliece8192128f/sse/vec128.h new file mode 100644 index 00000000..4e9f4941 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/vec128.h @@ -0,0 +1,43 @@ +#ifndef PQCLEAN_MCELIECE8192128F_SSE_VEC128_H +#define PQCLEAN_MCELIECE8192128F_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE8192128F_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128F_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128F_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128F_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128F_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE8192128F_SSE_vec128_sq(vec128 *out, const vec128 *in); +void PQCLEAN_MCELIECE8192128F_SSE_vec128_inv(vec128 *out, const vec128 *in); +#endif diff --git a/crypto_kem/mceliece8192128f/sse/vec128_mul_asm.S b/crypto_kem/mceliece8192128f/sse/vec128_mul_asm.S new file mode 100644 index 00000000..81eed5e7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/sse/vec128_mul_asm.S @@ -0,0 +1,2127 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 a12 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 
r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r23 + +# qhasm: reg128 r24 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE8192128F_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a12 = mem128[ input_1 + 192 ] +# asm 1: movdqu 192(a12=reg128#2 +# asm 2: movdqu 192(a12=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg128#3 +# asm 2: vpand r12=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r13 = a12 & mem128[input_2 + 16] +# asm 1: vpand 16(r13=reg128#4 +# asm 2: vpand 16(r13=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r14 = a12 & mem128[input_2 + 32] +# asm 1: vpand 32(r14=reg128#5 +# asm 2: vpand 32(r14=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r15 = a12 & mem128[input_2 + 48] +# asm 1: vpand 48(r15=reg128#6 +# asm 2: vpand 48(r15=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r16 = a12 & mem128[input_2 + 64] +# asm 1: vpand 64(r16=reg128#7 +# asm 2: vpand 64(r16=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r17 = a12 & mem128[input_2 + 80] +# asm 1: vpand 80(r17=reg128#8 +# asm 2: vpand 80(r17=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r18 = a12 & mem128[input_2 + 96] +# asm 1: vpand 96(r18=reg128#9 +# asm 2: vpand 96(r18=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r19 = a12 & mem128[input_2 + 112] +# asm 1: vpand 112(r19=reg128#10 +# asm 2: vpand 112(r19=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r20 = a12 & mem128[input_2 + 128] +# asm 1: vpand 128(r20=reg128#11 +# asm 2: vpand 128(r20=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r21 = a12 & mem128[input_2 + 144] +# asm 1: vpand 144(r21=reg128#12 +# asm 2: vpand 144(r21=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r22 = a12 & mem128[input_2 + 160] +# asm 1: vpand 160(r22=reg128#13 +# asm 2: vpand 160(r22=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r23 = a12 & mem128[input_2 + 176] +# asm 1: vpand 176(r23=reg128#14 +# asm 2: vpand 176(r23=%xmm13 +vpand 176(%rdx),%xmm1,%xmm13 + +# qhasm: r24 = a12 & mem128[input_2 + 192] +# asm 1: vpand 192(r24=reg128#2 +# asm 2: vpand 192(r24=%xmm1 +vpand 192(%rdx),%xmm1,%xmm1 + +# qhasm: r15 ^= r24 +# asm 1: pxor r11=reg128#2 +# asm 2: movdqa r11=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#15 +# asm 2: movdqu 176(a11=%xmm14 +movdqu 176(%rsi),%xmm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: 
pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r22 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r23 ^= r +# asm 1: pxor r10=reg128#14 +# asm 2: movdqa r10=%xmm13 +movdqa %xmm13,%xmm13 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#15 +# asm 2: movdqu 160(a10=%xmm14 +movdqu 160(%rsi),%xmm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r21 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r22 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#15 +# asm 2: movdqu 144(a9=%xmm14 +movdqu 144(%rsi),%xmm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# 
asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r21 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#15 +# asm 2: movdqu 128(a8=%xmm14 +movdqu 128(%rsi),%xmm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#15 +# asm 2: movdqu 112(a7=%xmm14 +movdqu 112(%rsi),%xmm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# 
asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#15 +# asm 2: movdqu 96(a6=%xmm14 +movdqu 96(%rsi),%xmm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#15 +# asm 2: movdqu 80(a5=%xmm14 +movdqu 80(%rsi),%xmm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 
64(a4=reg128#15 +# asm 2: movdqu 64(a4=%xmm14 +movdqu 64(%rsi),%xmm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#15 +# asm 2: movdqu 48(a3=%xmm14 +movdqu 48(%rsi),%xmm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#15 +# asm 2: movdqu 32(a2=%xmm14 +movdqu 32(%rsi),%xmm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 
32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#15 +# asm 2: movdqu 16(a1=%xmm14 +movdqu 16(%rsi),%xmm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#16 +# asm 2: vpand r=%xmm15 +vpand %xmm0,%xmm14,%xmm15 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 16(r=%xmm15 +vpand 16(%rdx),%xmm14,%xmm15 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 32(r=%xmm15 +vpand 32(%rdx),%xmm14,%xmm15 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 48(r=%xmm15 +vpand 48(%rdx),%xmm14,%xmm15 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 64(r=%xmm15 +vpand 64(%rdx),%xmm14,%xmm15 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 80(r=%xmm15 +vpand 80(%rdx),%xmm14,%xmm15 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 96(r=%xmm15 +vpand 96(%rdx),%xmm14,%xmm15 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 112(r=%xmm15 +vpand 112(%rdx),%xmm14,%xmm15 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 128(r=%xmm15 +vpand 128(%rdx),%xmm14,%xmm15 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 144(r=%xmm15 +vpand 144(%rdx),%xmm14,%xmm15 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 160(r=%xmm15 +vpand 160(%rdx),%xmm14,%xmm15 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#16 +# asm 2: vpand 176(r=%xmm15 +vpand 176(%rdx),%xmm14,%xmm15 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 192(r=%xmm14 +vpand 192(%rdx),%xmm14,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#15 +# asm 2: movdqu 0(a0=%xmm14 +movdqu 0(%rsi),%xmm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm14,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm14,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm14,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm14,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm14,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm14,%xmm0 + +# qhasm: r5 ^= 
r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 96(%rdx),%xmm14,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm14,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm14,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm14,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm14,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm14,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 192(r=%xmm0 +vpand 192(%rdx),%xmm14,%xmm0 + +# qhasm: r12 ^= r +# asm 1: pxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: 
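The popcnt sequence that follows appears to be the vector-reduce step: the two 64-bit halves of each bit plane of a bitsliced field element are XOR-folded, and the parity of the fold becomes one bit of the resulting gf (the same reduction appears in portable form as vec_reduce in the vec implementation's bm.c later in this diff). A rough C sketch, assuming the SSE layout of one 128-bit plane, i.e. two adjacent 64-bit words, per coefficient bit; vec_reduce_sketch is an illustrative name only:

#include <stdint.h>

#define GFBITS 13
typedef uint16_t gf;

/* sketch: collapse GFBITS planes of 2 x 64 bits into one field element;
   the assembly computes the per-plane parity with popcnt ... & 1 */
static gf vec_reduce_sketch(const uint64_t in[2 * GFBITS]) {
    gf ret = 0;
    int i;

    for (i = GFBITS - 1; i >= 0; i--) {
        uint64_t t = in[2 * i + 0] ^ in[2 * i + 1];   /* fold the two halves */

        t ^= t >> 32;
        t ^= t >> 16;
        t ^= t >> 8;
        t ^= t >> 4;
        t ^= t >> 2;
        t ^= t >> 1;                                  /* t & 1 is now the parity */

        ret = (gf)((ret << 1) | (t & 1));
    }

    return ret;
}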
popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128F_VEC_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/crypto_kem/mceliece8192128f/vec/api.h b/crypto_kem/mceliece8192128f/vec/api.h new file mode 100644 index 00000000..93dbda5d --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_API_H +#define PQCLEAN_MCELIECE8192128F_VEC_API_H + +#include + +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_ALGNAME "Classic McEliece 8192128f" +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/benes.c b/crypto_kem/mceliece8192128f/vec/benes.c new file mode 100644 index 00000000..1c4d60bb --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/benes.c @@ -0,0 +1,147 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +/* middle layers of the benes network */ +static void layer_in(uint64_t data[2][64], uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[0][j + 0] ^ data[0][j + s]); + d &= (*bits++); + data[0][j + 0] ^= d; + data[0][j + s] ^= d; + + d = (data[1][j + 0] ^ data[1][j + s]); + d &= (*bits++); + data[1][j + 0] ^= d; + data[1][j + s] ^= d; + } + } +} + +/* first and last layers of the benes network */ +static void layer_ex(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 128; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = 
(data[j + 0] ^ data[j + s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128F_VEC_benes(vec *r, const unsigned char *bits, int rev) { + int i, iter, inc; + + const unsigned char *bits_ptr; + + uint64_t r_int_v[2][64]; + uint64_t r_int_h[2][64]; + uint64_t b_int_v[64]; + uint64_t b_int_h[64]; + + // + + if (rev) { + bits_ptr = bits + 12288; + inc = -1024; + } else { + bits_ptr = bits; + inc = 0; + } + + for (i = 0; i < 64; i++) { + r_int_v[0][i] = r[i * 2 + 0]; + r_int_v[1][i] = r[i * 2 + 1]; + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 0; iter <= 6; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (iter = 0; iter <= 5; iter++) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + for (iter = 4; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + bits_ptr += inc; + + layer_in(r_int_v, b_int_v, iter); + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_h[0], r_int_v[0]); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_h[1], r_int_v[1]); + + for (iter = 6; iter >= 0; iter--) { + for (i = 0; i < 64; i++) { + b_int_v[i] = PQCLEAN_MCELIECE8192128F_VEC_load8(bits_ptr); + bits_ptr += 8; + } + + bits_ptr += inc; + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(b_int_h, b_int_v); + + layer_ex(r_int_h[0], b_int_h, iter); + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_v[0], r_int_h[0]); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(r_int_v[1], r_int_h[1]); + + for (i = 0; i < 64; i++) { + r[i * 2 + 0] = r_int_v[0][i]; + r[i * 2 + 1] = r_int_v[1][i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/vec/benes.h b/crypto_kem/mceliece8192128f/vec/benes.h new file mode 100644 index 00000000..a89ce00d --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/benes.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_BENES_H +#define PQCLEAN_MCELIECE8192128F_VEC_BENES_H +/* + This file is for Benes network related functions +*/ + +#include + +void PQCLEAN_MCELIECE8192128F_VEC_benes(uint64_t * /*r*/, const unsigned char * /*bits*/, int /*rev*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/bm.c b/crypto_kem/mceliece8192128f/vec/bm.c new file mode 100644 index 00000000..60278cd7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/bm.c @@ -0,0 +1,245 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - 
a_tmp; + + ret >>= 31; + ret -= 1; + + return (uint16_t)ret; +} + +static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) { + int i; + + vec m0, m1; + + m0 = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(mask); + m1 = ~m0; + + for (i = 0; i < GFBITS; i++) { + out[i] = (in[i] & m0) | (out[i] & m1); + out[i] = (in[i] & m0) | (out[i] & m1); + } +} + +static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) { + int s = 1 << b; + + vec x, y; + + x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s); + y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, const vec *in) { + int i, k; + + vec mask[4][2]; + vec buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = 0; + } + + mask[0][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK; + } + } +} + +static void update(vec in[][GFBITS], const gf e) { + int i; + vec tmp; + + for (i = 0; i < GFBITS; i++) { + tmp = (e >> i) & 1; + + in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63); + in[1][i] = (in[1][i] >> 1) | (tmp << 63); + } +} + +static inline gf vec_reduce(vec in[][GFBITS]) { + int i; + vec tmp; + gf ret = 0; + + for (i = GFBITS - 1; i >= 0; i--) { + tmp = in[0][i] ^ in[1][i]; + + tmp ^= tmp >> 32; + tmp ^= tmp >> 16; + tmp ^= tmp >> 8; + tmp ^= tmp >> 4; + tmp ^= tmp >> 2; + tmp ^= tmp >> 1; + + ret <<= 1; + ret |= tmp & 1; + } + + return ret; +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128F_VEC_bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int 
i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec prod[2][GFBITS]; + vec interval[2][GFBITS]; + vec dd[2][GFBITS], bb[2][GFBITS]; + vec B[2][GFBITS], C[2][GFBITS]; + vec B_tmp[2][GFBITS], C_tmp[2][GFBITS]; + vec v[GFBITS]; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(&coefs[ 0], in[0]); + get_coefs(&coefs[ 64], in[1]); + get_coefs(&coefs[128], in[2]); + get_coefs(&coefs[192], in[3]); + + C[0][0] = 0; + C[1][0] = 0; + B[0][0] = 0; + B[1][0] = one << 63; + + for (i = 1; i < GFBITS; i++) { + C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0; + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[0][i] = interval[1][i] = 0; + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[0], C[0], interval[0]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[1], C[1], interval[1]); + update(interval, coefs[N]); + d = vec_reduce(prod); + + t = PQCLEAN_MCELIECE8192128F_VEC_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((d >> i) & 1); + bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(B_tmp[0], dd[0], B[0]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(B_tmp[1], dd[1], B[1]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(C_tmp[0], bb[0], C[0]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(C_tmp[1], bb[1], C[1]); + + vec_cmov(B[0], C[0], mask); + vec_cmov(B[1], C[1], mask); + update(B, c0 & mask); + + for (i = 0; i < GFBITS; i++) { + C[0][i] = B_tmp[0][i] ^ C_tmp[0][i]; + C[1][i] = B_tmp[1][i] ^ C_tmp[1][i]; + } + + c0 = (gf)(t >> 32); + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128F_VEC_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + v[i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[0], C[0], v); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[1], C[1], v); +} + diff --git a/crypto_kem/mceliece8192128f/vec/bm.h b/crypto_kem/mceliece8192128f/vec/bm.h new file mode 100644 index 00000000..87e95ee4 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/bm.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_BM_H +#define PQCLEAN_MCELIECE8192128F_VEC_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128F_VEC_bm(vec /*out*/[][GFBITS], vec /*in*/[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/consts.inc b/crypto_kem/mceliece8192128f/vec/consts.inc new file mode 100644 index 00000000..1875ca4d --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/consts.inc @@ -0,0 +1,1920 @@ +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0x6969969669699696, + 0x9966669966999966, + 0x9966669966999966, + 0xFF0000FF00FFFF00, + 0xCC3333CCCC3333CC, + 0x9966669966999966, + 0x6666666666666666, + 0xA55AA55AA55AA55A, + 0xCCCC33333333CCCC, + 0x5A5A5A5A5A5A5A5A, + 0x55AAAA55AA5555AA, + 0x0FF0F00FF00F0FF0, + 0x5AA55AA5A55AA55A +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 
0x9999999966666666, + 0x3C3CC3C3C3C33C3C, + 0xFFFF0000FFFF0000, + 0x0000000000000000, + 0xCC33CC3333CC33CC, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0x00FFFF0000FFFF00 +}, +{ + 0xA55A5AA55AA5A55A, + 0x6969696996969696, + 0x5AA55AA5A55AA55A, + 0x6666666699999999, + 0xC3C33C3C3C3CC3C3, + 0x0000FFFF0000FFFF, + 0x0000000000000000, + 0x33CC33CCCC33CC33, + 0x0000000000000000, + 0x3C3C3C3C3C3C3C3C, + 0xAA5555AAAA5555AA, + 0xC33C3CC33CC3C33C, + 0xFF0000FFFF0000FF +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x9669966969966996, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0xA5A5A5A55A5A5A5A, + 0x0FF0F00FF00F0FF0, + 0x6996699696699669, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x6996699696699669, + 0x0000FFFFFFFF0000, + 0x33333333CCCCCCCC, + 0x5AA5A55AA55A5AA5, + 0xFF0000FFFF0000FF, + 0xFFFFFFFFFFFFFFFF, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAA55555555AAAA +}, +{ + 0xFFFFFFFF00000000, + 0x5A5A5A5AA5A5A5A5, + 0xF00F0FF00FF0F00F, + 0x9669966969966996, + 0xFFFF00000000FFFF, + 0x33333333CCCCCCCC, + 0xA55A5AA55AA5A55A, + 0x00FFFF0000FFFF00, + 0x0000000000000000, + 0xC33CC33CC33CC33C, + 0x0F0FF0F00F0FF0F0, + 0xCCCCCCCCCCCCCCCC, + 0x5555AAAAAAAA5555 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0x3CC33CC3C33CC33C, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0xAA55AA5555AA55AA, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0xC33CC33C3CC33CC3, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0x5AA5A55A5AA5A55A, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x9999999966666666, + 0xC33CC33CC33CC33C, + 0x6666999999996666 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x9966996699669966, + 0xA55A5AA5A55A5AA5, + 0xC3C3C3C33C3C3C3C, + 0xC33CC33C3CC33CC3, + 0x3333CCCC3333CCCC, + 0x6666666699999999, + 0x3CC33CC33CC33CC3, + 0x9999666666669999 +}, +{ + 0xC33C3CC33CC3C33C, + 0x6699996699666699, + 0x9966996699669966, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 
0x6699669966996699, + 0x5AA5A55A5AA5A55A, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x6666666699999999, + 0xC33CC33CC33CC33C, + 0x9999666666669999 +}, +{ + 0x3CC3C33CC33C3CC3, + 0x9966669966999966, + 0x6699669966996699, + 0x6969969669699696, + 0x55AA55AAAA55AA55, + 0x6699669966996699, + 0xA55A5AA5A55A5AA5, + 0x3C3C3C3CC3C3C3C3, + 0x3CC33CC3C33CC33C, + 0xCCCC3333CCCC3333, + 0x9999999966666666, + 0x3CC33CC33CC33CC3, + 0x6666999999996666 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0xAA5555AA55AAAA55, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0x55555555AAAAAAAA, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0x55555555AAAAAAAA, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0x55AAAA55AA5555AA, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x9669699696696996, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 
0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x9669699696696996, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0xAAAAAAAA55555555, + 0xCCCC33333333CCCC, + 0x0000FFFFFFFF0000, + 0xFF0000FF00FFFF00, + 0x6996699669966996 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0xA55AA55A5AA55AA5, + 0x55AAAA55AA5555AA, + 0x0FF0F00F0FF0F00F, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0xAAAAAAAA55555555, + 0x3333CCCCCCCC3333, + 0x0000FFFFFFFF0000, + 0x00FFFF00FF0000FF, + 0x9669966996699669 +}, +{ + 0x3C3CC3C3C3C33C3C, + 0xAAAAAAAA55555555, + 0xF00FF00F0FF00FF0, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0x5AA55AA55AA55AA5, + 0x55555555AAAAAAAA, + 0x3333CCCCCCCC3333, + 0xFFFF00000000FFFF, + 0xFF0000FF00FFFF00, + 0x9669966996699669 +}, +{ + 0xC3C33C3C3C3CC3C3, + 0xAAAAAAAA55555555, + 0x0FF00FF0F00FF00F, + 0x5AA55AA5A55AA55A, + 0xAA5555AA55AAAA55, + 0xF00F0FF0F00F0FF0, + 0x6996966969969669, + 0xA55AA55AA55AA55A, + 0x55555555AAAAAAAA, + 0xCCCC33333333CCCC, + 0xFFFF00000000FFFF, + 0x00FFFF00FF0000FF, + 0x6996699669966996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 
0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0xAAAAAAAAAAAAAAAA, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 
0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0x0000FFFF0000FFFF, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0xC33C3CC3C33C3CC3, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0x55AA55AA55AA55AA, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0xFFFF0000FFFF0000, + 0x0F0F0F0FF0F0F0F0, + 0x00FFFF00FF0000FF, + 0xCC3333CC33CCCC33, + 0xFF0000FF00FFFF00, + 0x6996966996696996, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 
0x3CC3C33C3CC3C33C, + 0x5555555555555555, + 0xFFFF0000FFFF0000, + 0x3CC3C33C3CC3C33C, + 0xAA55AA55AA55AA55, + 0x0000FFFF0000FFFF, + 0xF0F0F0F00F0F0F0F, + 0xFF0000FF00FFFF00, + 0x33CCCC33CC3333CC, + 0x00FFFF00FF0000FF, + 0x9669699669969669, + 0xA55A5AA55AA5A55A, + 0x6996966996696996 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 
0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 
0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 
0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 
0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF0000FFFF0000, + 0xFF00FF00FF00FF00, + 0xF0F0F0F0F0F0F0F0, + 0xCCCCCCCCCCCCCCCC, + 0xAAAAAAAAAAAAAAAA +}, diff --git a/crypto_kem/mceliece8192128f/vec/controlbits.c b/crypto_kem/mceliece8192128f/vec/controlbits.c new file mode 100644 index 00000000..d62ca1f4 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } 
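+    /* sort() is a straight recursive merge sort: both halves are sorted in
+       place and then combined by merge(), whose compare/cswap schedule is a
+       fixed odd-even network, so the pattern of comparisons and memory
+       accesses depends only on n and never on the values being sorted. */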
+ sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128F_VEC_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128F_VEC_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128F_VEC_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE8192128F_VEC_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/vec/controlbits.h b/crypto_kem/mceliece8192128f/vec/controlbits.h new file mode 100644 index 00000000..8f28b206 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128F_VEC_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128F_VEC_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128F_VEC_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/crypto_hash.h b/crypto_kem/mceliece8192128f/vec/crypto_hash.h new file mode 100644 index 00000000..a35032ab --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128F_VEC_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/crypto_kem/mceliece8192128f/vec/decrypt.c b/crypto_kem/mceliece8192128f/vec/decrypt.c new file mode 100644 index 00000000..405c0019 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/decrypt.c @@ -0,0 +1,168 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" +#include "vec.h" + +#include + +static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, const vec *recv) { + int i, j; + + vec irr_int[2][ GFBITS ]; + vec eval[128][ GFBITS ]; + vec tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE8192128F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128F_VEC_fft(eval, irr_int); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_copy(inv[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_inv(tmp, inv[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_copy(inv[0], tmp); + + // + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], const vec *recv) { + int i, j; + + for (i = 0; i < 128; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = inv[i][j] & recv[i]; + } + } +} + +static void preprocess(vec *recv, const unsigned char *s) { + int i; + + recv[0] = 0; + + for (i = 1; i < 128; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 8; i++) { + recv[i] = PQCLEAN_MCELIECE8192128F_VEC_load8(s + i * 8); + } +} + +static uint16_t weight(const vec *v) { + uint16_t i, w = 0; + + for (i = 0; i < SYS_N; i++) { + w += (uint16_t)((v[i / 64] >> (i % 
64)) & 1); + } + + return w; +} + +static uint16_t synd_cmp(vec s0[][ GFBITS ], vec s1[][ GFBITS ]) { + int i, j; + vec diff = 0; + + for (i = 0; i < 4; i++) { + for (j = 0; j < GFBITS; j++) { + diff |= (s0[i][j] ^ s1[i][j]); + } + } + + return (uint16_t)PQCLEAN_MCELIECE8192128F_VEC_vec_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128F_VEC_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec inv[ 128 ][ GFBITS ]; + vec scaled[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + + vec error[ 128 ]; + + vec s_priv[ 4 ][ GFBITS ]; + vec s_priv_cmp[ 4 ][ GFBITS ]; + vec locator[2][ GFBITS ]; + + vec recv[ 128 ]; + vec allone; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE8192128F_VEC_benes(recv, sk + IRR_BYTES, 1); + scaling(scaled, inv, sk, recv); + PQCLEAN_MCELIECE8192128F_VEC_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE8192128F_VEC_bm(locator, s_priv); + + PQCLEAN_MCELIECE8192128F_VEC_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits(1); + + for (i = 0; i < 128; i++) { + error[i] = PQCLEAN_MCELIECE8192128F_VEC_vec_or_reduce(eval[i]); + error[i] ^= allone; + } + + check_weight = weight(error) ^ SYS_T; + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE8192128F_VEC_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE8192128F_VEC_benes(error, sk + IRR_BYTES, 0); + + for (i = 0; i < 128; i++) { + PQCLEAN_MCELIECE8192128F_VEC_store8(e + i * 8, error[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/crypto_kem/mceliece8192128f/vec/decrypt.h b/crypto_kem/mceliece8192128f/vec/decrypt.h new file mode 100644 index 00000000..220e41cf --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_DECRYPT_H +#define PQCLEAN_MCELIECE8192128F_VEC_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128F_VEC_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/encrypt.c b/crypto_kem/mceliece8192128f/vec/encrypt.c new file mode 100644 index 00000000..85ccaa5c --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/encrypt.c @@ -0,0 +1,116 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq; + + uint16_t ind[ SYS_T ]; + uint8_t *ind8 = (uint8_t *)ind; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes(ind8, sizeof(ind)); + for (i = 0; i < sizeof(ind); i += 2) { + ind[i / 2] = (uint16_t)ind8[i + 1] << 8 | ind8[i]; + } + + for (i = 0; i < SYS_T; i++) { + ind[i] &= GFMASK; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] 
= 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128F_VEC_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + uint64_t b; + + const uint8_t *e_ptr8 = e + SYND_BYTES; + const uint8_t *pk_ptr8; + + int i, j; + + // + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = e[i]; + } + + for (i = 0; i < PK_NROWS; i++) { + pk_ptr8 = pk + PK_ROW_BYTES * i; + + b = 0; + for (j = 0; j < PK_NCOLS / 64; j++) { + b ^= PQCLEAN_MCELIECE8192128F_VEC_load8(pk_ptr8 + 8 * j) & PQCLEAN_MCELIECE8192128F_VEC_load8(e_ptr8 + 8 * j); + } + + b ^= b >> 32; + b ^= b >> 16; + b ^= b >> 8; + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] ^= (b << (i % 8)); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128F_VEC_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + syndrome(s, pk, e); +} + diff --git a/crypto_kem/mceliece8192128f/vec/encrypt.h b/crypto_kem/mceliece8192128f/vec/encrypt.h new file mode 100644 index 00000000..bdbb115c --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128F_VEC_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128F_VEC_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/fft.c b/crypto_kem/mceliece8192128f/vec/fft.c new file mode 100644 index 00000000..16a69b2b --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/fft.c @@ -0,0 +1,274 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec in[][GFBITS]) { + int i, j, k; + + const vec mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const vec s[5][2][GFBITS] = { +#include "scalars_2x.inc" + }; + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[1][i] >> 32; + in[0][i] ^= in[1][i] << 32; + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[0], in[0], s[j][0]); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[1], in[1], s[j][1]); + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[8][ GFBITS ]; + vec buf[128]; + + uint64_t consts_ptr = 2; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec powers[ 
128 ][ GFBITS ] = { +#include "powers.inc" + }; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476}; + + // + + for (i = 0; i < 7; i++) { + for (j = 0; j < GFBITS; j++) { + pre[i][j] = (beta[i] >> j) & 1; + pre[i][j] = -pre[i][j]; + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(pre[i], in[1], pre[i]); + } + + for (i = 0; i < GFBITS; i++) { + buf[0] = in[0][i]; + + buf[1] = buf[0] ^ pre[0][i]; + buf[32] = in[0][i] ^ pre[5][i]; + buf[3] = buf[1] ^ pre[1][i]; + buf[96] = buf[32] ^ pre[6][i]; + buf[97] = buf[96] ^ pre[0][i]; + buf[2] = in[0][i] ^ pre[1][i]; + buf[99] = buf[97] ^ pre[1][i]; + buf[6] = buf[2] ^ pre[2][i]; + buf[98] = buf[99] ^ pre[0][i]; + buf[7] = buf[6] ^ pre[0][i]; + buf[102] = buf[98] ^ pre[2][i]; + buf[5] = buf[7] ^ pre[1][i]; + buf[103] = buf[102] ^ pre[0][i]; + buf[101] = buf[103] ^ pre[1][i]; + buf[4] = in[0][i] ^ pre[2][i]; + buf[100] = buf[101] ^ pre[0][i]; + buf[12] = buf[4] ^ pre[3][i]; + buf[108] = buf[100] ^ pre[3][i]; + buf[13] = buf[12] ^ pre[0][i]; + buf[109] = buf[108] ^ pre[0][i]; + buf[15] = buf[13] ^ pre[1][i]; + buf[111] = buf[109] ^ pre[1][i]; + buf[14] = buf[15] ^ pre[0][i]; + buf[110] = buf[111] ^ pre[0][i]; + buf[10] = buf[14] ^ pre[2][i]; + buf[106] = buf[110] ^ pre[2][i]; + buf[11] = buf[10] ^ pre[0][i]; + buf[107] = buf[106] ^ pre[0][i]; + buf[9] = buf[11] ^ pre[1][i]; + buf[105] = buf[107] ^ pre[1][i]; + buf[104] = buf[105] ^ pre[0][i]; + buf[8] = in[0][i] ^ pre[3][i]; + buf[120] = buf[104] ^ pre[4][i]; + buf[24] = buf[8] ^ pre[4][i]; + buf[121] = buf[120] ^ pre[0][i]; + buf[25] = buf[24] ^ pre[0][i]; + buf[123] = buf[121] ^ pre[1][i]; + buf[27] = buf[25] ^ pre[1][i]; + buf[122] = buf[123] ^ pre[0][i]; + buf[26] = buf[27] ^ pre[0][i]; + buf[126] = buf[122] ^ pre[2][i]; + buf[30] = buf[26] ^ pre[2][i]; + buf[127] = buf[126] ^ pre[0][i]; + buf[31] = buf[30] ^ pre[0][i]; + buf[125] = buf[127] ^ pre[1][i]; + buf[29] = buf[31] ^ pre[1][i]; + buf[124] = buf[125] ^ pre[0][i]; + buf[28] = buf[29] ^ pre[0][i]; + buf[116] = buf[124] ^ pre[3][i]; + buf[20] = buf[28] ^ pre[3][i]; + buf[117] = buf[116] ^ pre[0][i]; + buf[21] = buf[20] ^ pre[0][i]; + buf[119] = buf[117] ^ pre[1][i]; + buf[23] = buf[21] ^ pre[1][i]; + buf[118] = buf[119] ^ pre[0][i]; + buf[22] = buf[23] ^ pre[0][i]; + buf[114] = buf[118] ^ pre[2][i]; + buf[18] = buf[22] ^ pre[2][i]; + buf[115] = buf[114] ^ pre[0][i]; + buf[19] = buf[18] ^ pre[0][i]; + buf[113] = buf[115] ^ pre[1][i]; + buf[17] = buf[19] ^ pre[1][i]; + buf[112] = buf[113] ^ pre[0][i]; + buf[80] = buf[112] ^ pre[5][i]; + buf[16] = in[0][i] ^ pre[4][i]; + buf[81] = buf[80] ^ pre[0][i]; + buf[48] = buf[16] ^ pre[5][i]; + buf[83] = buf[81] ^ pre[1][i]; + buf[49] = buf[48] ^ pre[0][i]; + buf[82] = buf[83] ^ pre[0][i]; + buf[51] = buf[49] ^ pre[1][i]; + buf[86] = buf[82] ^ pre[2][i]; + buf[50] = buf[51] ^ pre[0][i]; + buf[87] = buf[86] ^ pre[0][i]; + buf[54] = buf[50] ^ pre[2][i]; + buf[85] = buf[87] ^ pre[1][i]; + buf[55] = 
buf[54] ^ pre[0][i]; + buf[84] = buf[85] ^ pre[0][i]; + buf[53] = buf[55] ^ pre[1][i]; + buf[92] = buf[84] ^ pre[3][i]; + buf[52] = buf[53] ^ pre[0][i]; + buf[93] = buf[92] ^ pre[0][i]; + buf[60] = buf[52] ^ pre[3][i]; + buf[95] = buf[93] ^ pre[1][i]; + buf[61] = buf[60] ^ pre[0][i]; + buf[94] = buf[95] ^ pre[0][i]; + buf[63] = buf[61] ^ pre[1][i]; + buf[90] = buf[94] ^ pre[2][i]; + buf[62] = buf[63] ^ pre[0][i]; + buf[91] = buf[90] ^ pre[0][i]; + buf[58] = buf[62] ^ pre[2][i]; + buf[89] = buf[91] ^ pre[1][i]; + buf[59] = buf[58] ^ pre[0][i]; + buf[88] = buf[89] ^ pre[0][i]; + buf[57] = buf[59] ^ pre[1][i]; + buf[72] = buf[88] ^ pre[4][i]; + buf[56] = buf[57] ^ pre[0][i]; + buf[73] = buf[72] ^ pre[0][i]; + buf[40] = buf[56] ^ pre[4][i]; + buf[75] = buf[73] ^ pre[1][i]; + buf[41] = buf[40] ^ pre[0][i]; + buf[74] = buf[75] ^ pre[0][i]; + buf[43] = buf[41] ^ pre[1][i]; + buf[78] = buf[74] ^ pre[2][i]; + buf[42] = buf[43] ^ pre[0][i]; + buf[79] = buf[78] ^ pre[0][i]; + buf[46] = buf[42] ^ pre[2][i]; + buf[77] = buf[79] ^ pre[1][i]; + buf[47] = buf[46] ^ pre[0][i]; + buf[76] = buf[77] ^ pre[0][i]; + buf[45] = buf[47] ^ pre[1][i]; + buf[68] = buf[76] ^ pre[3][i]; + buf[44] = buf[45] ^ pre[0][i]; + buf[69] = buf[68] ^ pre[0][i]; + buf[36] = buf[44] ^ pre[3][i]; + buf[71] = buf[69] ^ pre[1][i]; + buf[37] = buf[36] ^ pre[0][i]; + buf[70] = buf[71] ^ pre[0][i]; + buf[39] = buf[37] ^ pre[1][i]; + buf[66] = buf[70] ^ pre[2][i]; + buf[38] = buf[39] ^ pre[0][i]; + buf[67] = buf[66] ^ pre[0][i]; + buf[34] = buf[38] ^ pre[2][i]; + buf[65] = buf[67] ^ pre[1][i]; + buf[35] = buf[34] ^ pre[0][i]; + buf[33] = buf[35] ^ pre[1][i]; + buf[64] = in[0][i] ^ pre[6][i]; + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf + 0, buf + 0); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf + 64, buf + 64); + + for (j = 0; j < 128; j++) { + out[ reversal[j] ][i] = buf[j]; + } + } + + for (i = 1; i <= 6; i++) { + s = 1 << i; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k ][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += (uint64_t)1 << i; + } + + // adding the part contributed by x^128 + + for (i = 0; i < 128; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128F_VEC_fft(vec out[][GFBITS], vec in[][GFBITS]) { + radix_conversions(in); + butterflies(out, in); +} diff --git a/crypto_kem/mceliece8192128f/vec/fft.h b/crypto_kem/mceliece8192128f/vec/fft.h new file mode 100644 index 00000000..ea2e7fd9 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/fft.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_FFT_H +#define PQCLEAN_MCELIECE8192128F_VEC_FFT_H +/* + This file is for the Gao-Mateer FFT + see http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128F_VEC_fft(vec /*out*/[][GFBITS], vec /*in*/[][GFBITS]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/fft_tr.c b/crypto_kem/mceliece8192128f/vec/fft_tr.c new file mode 100644 index 00000000..0f4b8ca1 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/fft_tr.c @@ -0,0 +1,289 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) 
the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec in[][ GFBITS ]) { + int i, j, k; + + const vec mask[6][2] = { + {0x2222222222222222, 0x4444444444444444}, + {0x0C0C0C0C0C0C0C0C, 0x3030303030303030}, + {0x00F000F000F000F0, 0x0F000F000F000F00}, + {0x0000FF000000FF00, 0x00FF000000FF0000}, + {0x00000000FFFF0000, 0x0000FFFF00000000}, + {0xFFFFFFFF00000000, 0x00000000FFFFFFFF} + }; + + const vec s[6][4][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[0], in[0], s[j][0]); // scaling + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[1], in[1], s[j][1]); // scaling + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[2], in[2], s[j][2]); // scaling + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(in[3], in[3], s[j][3]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k); + in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k); + in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k); + in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k); + in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + in[1][i] ^= in[0][i] >> 32; + in[1][i] ^= in[1][i] << 32; + + in[3][i] ^= in[2][i] >> 32; + in[3][i] ^= in[3][i] << 32; + } + } + + for (i = 0; i < GFBITS; i++) { + in[3][i] ^= in[2][i] ^= in[1][i]; + } + } +} + +static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ]) { + int i, j, k, s, b; + + vec tmp[ GFBITS ]; + vec pre[6][2][ GFBITS ]; + vec buf[2][64]; + + const vec consts[ 128 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 128; + + const unsigned char reversal[128] = { + 0, 64, 32, 96, 16, 80, 48, 112, + 8, 72, 40, 104, 24, 88, 56, 120, + 4, 68, 36, 100, 20, 84, 52, 116, + 12, 76, 44, 108, 28, 92, 60, 124, + 2, 66, 34, 98, 18, 82, 50, 114, + 10, 74, 42, 106, 26, 90, 58, 122, + 6, 70, 38, 102, 22, 86, 54, 118, + 14, 78, 46, 110, 30, 94, 62, 126, + 1, 65, 33, 97, 17, 81, 49, 113, + 9, 73, 41, 105, 25, 89, 57, 121, + 5, 69, 37, 101, 21, 85, 53, 117, + 13, 77, 45, 109, 29, 93, 61, 125, + 3, 67, 35, 99, 19, 83, 51, 115, + 11, 75, 43, 107, 27, 91, 59, 123, + 7, 71, 39, 103, 23, 87, 55, 119, + 15, 79, 47, 111, 31, 95, 63, 127 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // + + for (i = 6; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 128; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp, in[k], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp[b]; + } + } + } + } + + for (i = 0; i < GFBITS; i++) { + for (k = 0; k < 128; k++) { + (&buf[0][0])[ k ] = in[ reversal[k] ][i]; + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf[0], buf[0]); + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf[1], buf[1]); + + for (k = 0; k < 2; k++) { + pre[0][k][i] = buf[k][32]; + buf[k][33] ^= buf[k][32]; + pre[1][k][i] = buf[k][33]; + buf[k][35] ^= buf[k][33]; + pre[0][k][i] ^= buf[k][35]; + buf[k][34] ^= buf[k][35]; + pre[2][k][i] = buf[k][34]; + buf[k][38] ^= buf[k][34]; + pre[0][k][i] ^= buf[k][38]; + buf[k][39] ^= buf[k][38]; + pre[1][k][i] ^= buf[k][39]; + 
buf[k][37] ^= buf[k][39]; + pre[0][k][i] ^= buf[k][37]; + buf[k][36] ^= buf[k][37]; + pre[3][k][i] = buf[k][36]; + buf[k][44] ^= buf[k][36]; + pre[0][k][i] ^= buf[k][44]; + buf[k][45] ^= buf[k][44]; + pre[1][k][i] ^= buf[k][45]; + buf[k][47] ^= buf[k][45]; + pre[0][k][i] ^= buf[k][47]; + buf[k][46] ^= buf[k][47]; + pre[2][k][i] ^= buf[k][46]; + buf[k][42] ^= buf[k][46]; + pre[0][k][i] ^= buf[k][42]; + buf[k][43] ^= buf[k][42]; + pre[1][k][i] ^= buf[k][43]; + buf[k][41] ^= buf[k][43]; + pre[0][k][i] ^= buf[k][41]; + buf[k][40] ^= buf[k][41]; + pre[4][k][i] = buf[k][40]; + buf[k][56] ^= buf[k][40]; + pre[0][k][i] ^= buf[k][56]; + buf[k][57] ^= buf[k][56]; + pre[1][k][i] ^= buf[k][57]; + buf[k][59] ^= buf[k][57]; + pre[0][k][i] ^= buf[k][59]; + buf[k][58] ^= buf[k][59]; + pre[2][k][i] ^= buf[k][58]; + buf[k][62] ^= buf[k][58]; + pre[0][k][i] ^= buf[k][62]; + buf[k][63] ^= buf[k][62]; + pre[1][k][i] ^= buf[k][63]; + buf[k][61] ^= buf[k][63]; + pre[0][k][i] ^= buf[k][61]; + buf[k][60] ^= buf[k][61]; + pre[3][k][i] ^= buf[k][60]; + buf[k][52] ^= buf[k][60]; + pre[0][k][i] ^= buf[k][52]; + buf[k][53] ^= buf[k][52]; + pre[1][k][i] ^= buf[k][53]; + buf[k][55] ^= buf[k][53]; + pre[0][k][i] ^= buf[k][55]; + buf[k][54] ^= buf[k][55]; + pre[2][k][i] ^= buf[k][54]; + buf[k][50] ^= buf[k][54]; + pre[0][k][i] ^= buf[k][50]; + buf[k][51] ^= buf[k][50]; + pre[1][k][i] ^= buf[k][51]; + buf[k][49] ^= buf[k][51]; + pre[0][k][i] ^= buf[k][49]; + buf[k][48] ^= buf[k][49]; + pre[5][k][i] = buf[k][48]; + buf[k][16] ^= buf[k][48]; + pre[0][k][i] ^= buf[k][16]; + buf[k][17] ^= buf[k][16]; + pre[1][k][i] ^= buf[k][17]; + buf[k][19] ^= buf[k][17]; + pre[0][k][i] ^= buf[k][19]; + buf[k][18] ^= buf[k][19]; + pre[2][k][i] ^= buf[k][18]; + buf[k][22] ^= buf[k][18]; + pre[0][k][i] ^= buf[k][22]; + buf[k][23] ^= buf[k][22]; + pre[1][k][i] ^= buf[k][23]; + buf[k][21] ^= buf[k][23]; + pre[0][k][i] ^= buf[k][21]; + buf[k][20] ^= buf[k][21]; + pre[3][k][i] ^= buf[k][20]; + buf[k][28] ^= buf[k][20]; + pre[0][k][i] ^= buf[k][28]; + buf[k][29] ^= buf[k][28]; + pre[1][k][i] ^= buf[k][29]; + buf[k][31] ^= buf[k][29]; + pre[0][k][i] ^= buf[k][31]; + buf[k][30] ^= buf[k][31]; + pre[2][k][i] ^= buf[k][30]; + buf[k][26] ^= buf[k][30]; + pre[0][k][i] ^= buf[k][26]; + buf[k][27] ^= buf[k][26]; + pre[1][k][i] ^= buf[k][27]; + buf[k][25] ^= buf[k][27]; + pre[0][k][i] ^= buf[k][25]; + buf[k][24] ^= buf[k][25]; + pre[4][k][i] ^= buf[k][24]; + buf[k][8] ^= buf[k][24]; + pre[0][k][i] ^= buf[k][8]; + buf[k][9] ^= buf[k][8]; + pre[1][k][i] ^= buf[k][9]; + buf[k][11] ^= buf[k][9]; + pre[0][k][i] ^= buf[k][11]; + buf[k][10] ^= buf[k][11]; + pre[2][k][i] ^= buf[k][10]; + buf[k][14] ^= buf[k][10]; + pre[0][k][i] ^= buf[k][14]; + buf[k][15] ^= buf[k][14]; + pre[1][k][i] ^= buf[k][15]; + buf[k][13] ^= buf[k][15]; + pre[0][k][i] ^= buf[k][13]; + buf[k][12] ^= buf[k][13]; + pre[3][k][i] ^= buf[k][12]; + buf[k][4] ^= buf[k][12]; + pre[0][k][i] ^= buf[k][4]; + buf[k][5] ^= buf[k][4]; + pre[1][k][i] ^= buf[k][5]; + buf[k][7] ^= buf[k][5]; + pre[0][k][i] ^= buf[k][7]; + buf[k][6] ^= buf[k][7]; + pre[2][k][i] ^= buf[k][6]; + buf[k][2] ^= buf[k][6]; + pre[0][k][i] ^= buf[k][2]; + buf[k][3] ^= buf[k][2]; + pre[1][k][i] ^= buf[k][3]; + buf[k][1] ^= buf[k][3]; + + pre[0][k][i] ^= buf[k][1]; + out[k][i] = buf[k][0] ^ buf[k][1]; + } + } + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[2], pre[0][0], tmp); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[3], 
pre[0][1], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(pre[i][0], pre[i][0], tmp); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(pre[i][1], pre[i][1], tmp); + + for (b = 0; b < GFBITS; b++) { + out[2][b] ^= pre[i][0][b]; + out[3][b] ^= pre[i][1][b]; + } + } + +} + +void PQCLEAN_MCELIECE8192128F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]) { + butterflies_tr(out, in); + + radix_conversions_tr(out); +} + diff --git a/crypto_kem/mceliece8192128f/vec/fft_tr.h b/crypto_kem/mceliece8192128f/vec/fft_tr.h new file mode 100644 index 00000000..b5f9b429 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_FFT_TR_H +#define PQCLEAN_MCELIECE8192128F_VEC_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec.h" + +void PQCLEAN_MCELIECE8192128F_VEC_fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/gf.c b/crypto_kem/mceliece8192128f/vec/gf.c new file mode 100644 index 00000000..a0f43674 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_VEC_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128F_VEC_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 
5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128F_VEC_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128F_VEC_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128F_VEC_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128F_VEC_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128F_VEC_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128F_VEC_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128F_VEC_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128F_VEC_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128F_VEC_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/crypto_kem/mceliece8192128f/vec/gf.h b/crypto_kem/mceliece8192128f/vec/gf.h new file mode 100644 index 00000000..c7fd7559 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_GF_H +#define PQCLEAN_MCELIECE8192128F_VEC_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128F_VEC_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128F_VEC_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128F_VEC_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128F_VEC_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128F_VEC_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_VEC_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/operations.c b/crypto_kem/mceliece8192128f/vec/operations.c new file mode 100644 index 00000000..3eb03f90 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include 
"randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128F_VEC_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128F_VEC_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128F_VEC_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128F_VEC_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128F_VEC_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128F_VEC_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128F_VEC_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128F_VEC_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128F_VEC_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128F_VEC_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/vec/params.h b/crypto_kem/mceliece8192128f/vec/params.h new file mode 100644 index 00000000..400d08dd --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_PARAMS_H +#define PQCLEAN_MCELIECE8192128F_VEC_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/pk_gen.c 
b/crypto_kem/mceliece8192128f/vec/pk_gen.c new file mode 100644 index 00000000..942173c2 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/pk_gen.c @@ -0,0 +1,301 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" +#include "vec.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec in[][GFBITS]) { + int i, j, r; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 0; r < 64; r++) { + out[i * 64 + r] <<= 1; + out[i * 64 + r] |= (in[i][j] >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t *in) { + int i, j, r; + + for (i = 0; i < 128; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out1[i][j] <<= 1; + out1[i][j] |= (in[i * 64 + r] >> (j + GFBITS)) & 1; + } + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (r = 63; r >= 0; r--) { + out0[i][GFBITS - 1 - j] <<= 1; + out0[i][GFBITS - 1 - j] |= (in[i * 64 + r] >> j) & 1; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + int i, b, m = 0, r = 0; + + for (i = 0; i < 64; i++) { + b = (int)(in >> i) & 1; + m |= b; + r += (m ^ 1) & (b ^ 1); + } + + return r; +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ 128 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
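+    // each round ORs the rows not yet used as pivots to check that a pivot
+    // column exists (otherwise the 32x64 submatrix is not full rank and -1
+    // is returned), picks the lowest such column with ctz(), brings it into
+    // row i with masked row additions, and clears it from every other row,
+    // using value-independent masks instead of branches.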
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask((uint16_t)k, (uint16_t)ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_VEC_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int nblocks_H = (SYS_N + 63) / 64; + const int nblocks_I = (GFBITS * SYS_T + 63) / 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + + uint64_t mask; + + vec irr_int[2][ GFBITS ]; + + vec consts[ 128 ][ GFBITS ]; + vec eval[ 128 ][ GFBITS ]; + vec prod[ 128 ][ GFBITS ]; + vec tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128F_VEC_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE8192128F_VEC_fft(eval, irr_int); + + PQCLEAN_MCELIECE8192128F_VEC_vec_copy(prod[0], eval[0]); + + for (i = 1; i < 128; i++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_inv(tmp, prod[127]); + + for (i = 126; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_VEC_vec_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128F_VEC_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < nblocks_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ j ] = prod[ j ][ k ]; + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < nblocks_H; j++) { + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ j ] = prod[ j ][ k ]; + } + } + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + if (row == GFBITS 
* SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < 128; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = nblocks_I; j < 128; j++) { + PQCLEAN_MCELIECE8192128F_VEC_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/crypto_kem/mceliece8192128f/vec/pk_gen.h b/crypto_kem/mceliece8192128f/vec/pk_gen.h new file mode 100644 index 00000000..96c12e6a --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_PK_GEN_H +#define PQCLEAN_MCELIECE8192128F_VEC_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128F_VEC_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/powers.inc b/crypto_kem/mceliece8192128f/vec/powers.inc new file mode 100644 index 00000000..a9bd6179 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/powers.inc @@ -0,0 +1,1920 @@ +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 
0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 
0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 
0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 
0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 
0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 
0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 
0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 
0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 
0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 
0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0x00000000FFFFFFFF, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0x5A5A5A5A5A5A5A5A, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0xCC33CC33CC33CC33, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0xCC33CC33CC33CC33, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x9696969669696969, + 0xA5A5A5A5A5A5A5A5, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0x0F0FF0F00F0FF0F0 +}, +{ + 0xA55AA55A5AA55AA5, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0x5A5A5A5A5A5A5A5A, + 0xA5A5A5A55A5A5A5A, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0x3CC33CC3C33CC33C, + 0xA5A55A5AA5A55A5A, + 0x0000FFFF0000FFFF, + 0x33CC33CC33CC33CC, + 0xF00FF00F0FF00FF0, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0x5555AAAAAAAA5555, + 0xF00FF00FF00FF00F, + 0xF0F00F0FF0F00F0F +}, +{ + 0x5AA55AA5A55AA55A, + 0xC33CC33C3CC33CC3, + 0xA5A55A5AA5A55A5A, + 0xFFFF0000FFFF0000, + 0x33CC33CC33CC33CC, + 0x0FF00FF0F00FF00F, + 0xFFFFFFFF00000000, + 0x6969696996969696, + 0xA5A5A5A5A5A5A5A5, + 0x5A5A5A5AA5A5A5A5, + 0xAAAA55555555AAAA, + 0x0FF00FF00FF00FF0, + 0x0F0FF0F00F0FF0F0 +} diff --git 
a/crypto_kem/mceliece8192128f/vec/scalars_2x.inc b/crypto_kem/mceliece8192128f/vec/scalars_2x.inc new file mode 100644 index 00000000..a0abb162 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/scalars_2x.inc @@ -0,0 +1,150 @@ +{{ + 0X3C3CF30C0000C003, + 0X0CCCC3F333C0000C, + 0X03C33F33FCC0C03C, + 0X0003000F3C03C0C0, + 0XF33FF33030CF03F0, + 0X0CF0303300F0CCC0, + 0XFF3F0C0CC0FF3CC0, + 0XCF3CF0FF003FC000, + 0XC00FF3CF0303F300, + 0X3CCC0CC00CF0CC00, + 0XF30FFC3C3FCCFC00, + 0X3F0FC3F0CCF0C000, + 0X3000FF33CCF0F000 +}, +{ + 0X0C0F0FCF0F0CF330, + 0XF0000FC33C3CCF3C, + 0X3C0F3F00C3C300FC, + 0X3C33CCC0F0F3CC30, + 0XC0CFFFFFCCCC30CC, + 0X3FC3F3CCFFFC033F, + 0XFC3030CCCCC0CFCF, + 0X0FCF0C00CCF333C3, + 0XCFFCF33000CFF030, + 0X00CFFCC330F30FCC, + 0X3CCC3FCCC0F3FFF3, + 0XF00F0C3FC003C0FF, + 0X330CCFCC03C0FC33 +}}, +{{ + 0X0F0F0FF0F000000F, + 0X00FFFFFFFF0000F0, + 0XFFFF00FF00000F00, + 0XFFF000F00F0FF000, + 0XFFF0000F0FF000F0, + 0X00FF000FFF000000, + 0XFF0F0FFF0F0FF000, + 0X0FFF0000000F0000, + 0X00F000F0FFF00F00, + 0X00F00FF00F00F000, + 0XFFF000F000F00000, + 0X00F00F000FF00000, + 0X0000FF0F0000F000 +}, +{ + 0XF0FFFFFFF0F00F00, + 0X00FFF0FFFF0000FF, + 0X00FF00000F0F0FFF, + 0XF000F0000F00FF0F, + 0XFF000000FFF00000, + 0XF0FF000FF00F0FF0, + 0X0F0F0F00FF000F0F, + 0X0F0F00F0F0F0F000, + 0X00F00F00F00F000F, + 0X00F0F0F00000FFF0, + 0XFFFFFF0FF00F0FFF, + 0X0F0FFFF00FFFFFFF, + 0XFFFF0F0FFF0FFF00 +}}, +{{ + 0X00FF0000000000FF, + 0XFFFFFFFFFF00FF00, + 0XFF0000FF00FF0000, + 0XFFFF000000FF0000, + 0XFF00000000FF0000, + 0X00FFFFFFFF000000, + 0XFF0000FFFFFF0000, + 0XFF00FF00FFFF0000, + 0X00FFFFFFFF00FF00, + 0XFFFF000000000000, + 0X00FF0000FF000000, + 0XFF00FF00FF000000, + 0X00FF00FFFF000000 +}, +{ + 0X00FF00FF00FF0000, + 0XFF00FFFF000000FF, + 0X0000FFFF000000FF, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF00FF, + 0X0000FFFF00FFFF00, + 0XFF00FF0000FFFF00, + 0X00000000FFFFFFFF, + 0X0000FF0000000000, + 0XFF00FFFF00FFFF00, + 0X00FFFF00000000FF, + 0X0000FF00FF00FFFF, + 0XFF0000FFFFFF0000 +}}, +{{ + 0X000000000000FFFF, + 0XFFFFFFFFFFFF0000, + 0X0000000000000000, + 0XFFFF0000FFFF0000, + 0XFFFFFFFFFFFF0000, + 0X0000FFFF00000000, + 0X0000FFFFFFFF0000, + 0XFFFF0000FFFF0000, + 0X0000FFFF00000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFF000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000FFFF00000000, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0X0000FFFF00000000, + 0XFFFF0000FFFF0000, + 0X0000FFFFFFFF0000, + 0X0000FFFF0000FFFF, + 0XFFFFFFFF0000FFFF, + 0X00000000FFFF0000, + 0XFFFF0000FFFFFFFF, + 0XFFFF0000FFFFFFFF, + 0X0000000000000000 +}}, +{{ + 0X00000000FFFFFFFF, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000 +}, +{ + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X0000000000000000, + 0X0000000000000000, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0XFFFFFFFFFFFFFFFF, + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFFFFFFFFFF, + 0XFFFFFFFF00000000 +}} diff --git a/crypto_kem/mceliece8192128f/vec/scalars_4x.inc b/crypto_kem/mceliece8192128f/vec/scalars_4x.inc new file mode 100644 index 00000000..cbaccec7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/scalars_4x.inc @@ -0,0 +1,360 @@ +{{ + 0x3C3CF30C0000C003, + 0x0CCCC3F333C0000C, + 0x03C33F33FCC0C03C, + 0x0003000F3C03C0C0, + 0xF33FF33030CF03F0, + 0x0CF0303300F0CCC0, + 
0xFF3F0C0CC0FF3CC0, + 0xCF3CF0FF003FC000, + 0xC00FF3CF0303F300, + 0x3CCC0CC00CF0CC00, + 0xF30FFC3C3FCCFC00, + 0x3F0FC3F0CCF0C000, + 0x3000FF33CCF0F000 +}, +{ + 0x0C0F0FCF0F0CF330, + 0xF0000FC33C3CCF3C, + 0x3C0F3F00C3C300FC, + 0x3C33CCC0F0F3CC30, + 0xC0CFFFFFCCCC30CC, + 0x3FC3F3CCFFFC033F, + 0xFC3030CCCCC0CFCF, + 0x0FCF0C00CCF333C3, + 0xCFFCF33000CFF030, + 0x00CFFCC330F30FCC, + 0x3CCC3FCCC0F3FFF3, + 0xF00F0C3FC003C0FF, + 0x330CCFCC03C0FC33 +}, +{ + 0xF0F30C33CF03F03F, + 0x00F30FC00C3300FF, + 0xF3CC3CF3F3FCF33F, + 0x3C0FC0FC303C3F3C, + 0xFC30CF303F3FF00F, + 0x33300C0CC3300CF3, + 0x3C030CF3F03FF3F3, + 0x3CCC03FCCC3FFC03, + 0x033C3C3CF0003FC3, + 0xFFC0FF00F0FF0F03, + 0xF3F30CF003FCC303, + 0x30CFCFC3CC0F3000, + 0x0CF30CCF3FCFCC0F +}, +{ + 0x3F30CC0C000F3FCC, + 0xFC3CF030FC3FFF03, + 0x33FFFCFF0CCF3CC3, + 0x003CFF33C3CC30CF, + 0xCFF3CF33C00F3003, + 0x00F3CC0CF3003CCF, + 0x3C000CFCCC3C3333, + 0xF3CF03C0FCF03FF0, + 0x3F3C3CF0C330330C, + 0x33CCFCC0FF0033F0, + 0x33C300C0F0C003F3, + 0x003FF0003F00C00C, + 0xCFF3C3033F030FFF +}}, +{{ + 0x0F0F0FF0F000000F, + 0x00FFFFFFFF0000F0, + 0xFFFF00FF00000F00, + 0xFFF000F00F0FF000, + 0xFFF0000F0FF000F0, + 0x00FF000FFF000000, + 0xFF0F0FFF0F0FF000, + 0x0FFF0000000F0000, + 0x00F000F0FFF00F00, + 0x00F00FF00F00F000, + 0xFFF000F000F00000, + 0x00F00F000FF00000, + 0x0000FF0F0000F000 +}, +{ + 0xF0FFFFFFF0F00F00, + 0x00FFF0FFFF0000FF, + 0x00FF00000F0F0FFF, + 0xF000F0000F00FF0F, + 0xFF000000FFF00000, + 0xF0FF000FF00F0FF0, + 0x0F0F0F00FF000F0F, + 0x0F0F00F0F0F0F000, + 0x00F00F00F00F000F, + 0x00F0F0F00000FFF0, + 0xFFFFFF0FF00F0FFF, + 0x0F0FFFF00FFFFFFF, + 0xFFFF0F0FFF0FFF00 +}, +{ + 0x0F0F00FF0FF0FFFF, + 0xF000F0F00F00FF0F, + 0x000FFFF0FFF0FF0F, + 0x00F00FFF00000FF0, + 0xFFFFF0000FFFF00F, + 0xFFF0FFF0000FFFF0, + 0xF0F0F0000F0F0F00, + 0x00F000F0F00FFF00, + 0xF0FF0F0FFF00F0FF, + 0xF0FF0FFFF0F0F0FF, + 0x00FFFFFFFFFFFFF0, + 0x00FFF0F0FF000F0F, + 0x000FFFF0000FFF00 +}, +{ + 0xFF0F0F00F000F0FF, + 0x0FFFFFFFFF00000F, + 0xF0FFFF000F00F0FF, + 0x0F0000F00FFF0FFF, + 0x0F0F0F00FF0F000F, + 0x000F0F0FFFF0F000, + 0xF0FFFF0F00F0FF0F, + 0x0F0F000F0F00F0FF, + 0x0000F0FF00FF0F0F, + 0x00FFFF0FF0FFF0F0, + 0x0000000F00F0FFF0, + 0xF0F00000FF00F0F0, + 0x0F0F0FFFFFFFFFFF +}}, +{{ + 0x00FF0000000000FF, + 0xFFFFFFFFFF00FF00, + 0xFF0000FF00FF0000, + 0xFFFF000000FF0000, + 0xFF00000000FF0000, + 0x00FFFFFFFF000000, + 0xFF0000FFFFFF0000, + 0xFF00FF00FFFF0000, + 0x00FFFFFFFF00FF00, + 0xFFFF000000000000, + 0x00FF0000FF000000, + 0xFF00FF00FF000000, + 0x00FF00FFFF000000 +}, +{ + 0x00FF00FF00FF0000, + 0xFF00FFFF000000FF, + 0x0000FFFF000000FF, + 0x00FFFF00FF000000, + 0xFFFFFF0000FF00FF, + 0x0000FFFF00FFFF00, + 0xFF00FF0000FFFF00, + 0x00000000FFFFFFFF, + 0x0000FF0000000000, + 0xFF00FFFF00FFFF00, + 0x00FFFF00000000FF, + 0x0000FF00FF00FFFF, + 0xFF0000FFFFFF0000 +}, +{ + 0xFFFF00FF00FF00FF, + 0x00FFFF000000FF00, + 0xFFFF00FFFFFFFF00, + 0x0000FFFF00FFFFFF, + 0x00FF0000FF0000FF, + 0xFFFF0000FF00FFFF, + 0xFF000000FFFFFF00, + 0x000000000000FFFF, + 0xFF00FF00FFFF0000, + 0xFFFF00FFFF00FFFF, + 0xFFFFFFFFFF00FF00, + 0xFFFF00FFFF0000FF, + 0x0000FF00000000FF +}, +{ + 0xFF0000FFFFFF00FF, + 0xFFFF0000FFFFFFFF, + 0xFFFF000000FFFFFF, + 0x00FFFF00FF0000FF, + 0xFFFFFF00FFFFFF00, + 0x00FFFF00FFFF00FF, + 0x0000FFFF00FF0000, + 0x000000FFFF000000, + 0xFF00FF0000FF00FF, + 0x00FF0000000000FF, + 0xFF00FFFF00FF00FF, + 0xFFFFFFFFFFFFFFFF, + 0x0000FF000000FFFF +}}, +{{ + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0x0000000000000000, + 0xFFFF0000FFFF0000, + 0xFFFFFFFFFFFF0000, + 0x0000FFFF00000000, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFF0000, + 
0x0000FFFF00000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFF000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000FFFF00000000, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000FFFF00000000, + 0xFFFF0000FFFF0000, + 0x0000FFFFFFFF0000, + 0x0000FFFF0000FFFF, + 0xFFFFFFFF0000FFFF, + 0x00000000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFF0000FFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFF000000000000, + 0x0000FFFF00000000, + 0x00000000FFFF0000, + 0x0000FFFFFFFFFFFF, + 0x0000FFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0x000000000000FFFF, + 0x000000000000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFF0000FFFF, + 0xFFFF0000FFFFFFFF +}, +{ + 0x0000FFFFFFFFFFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0xFFFF0000FFFFFFFF, + 0x00000000FFFF0000, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0xFFFF00000000FFFF, + 0x0000FFFF0000FFFF, + 0x0000FFFF00000000, + 0xFFFFFFFF00000000, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF +}}, +{{ + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFF00000000 +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000 +}, +{ + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x00000000FFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFF00000000, + 0x00000000FFFFFFFF, + 0xFFFFFFFF00000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF00000000 +}}, +{{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000 +}, +{ + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000 +}, +{ + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF +}, +{ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF +}}, diff --git a/crypto_kem/mceliece8192128f/vec/sk_gen.c b/crypto_kem/mceliece8192128f/vec/sk_gen.c new file mode 100644 index 00000000..a5212c34 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/sk_gen.c @@ -0,0 +1,98 @@ +/* + 
This file is for secret-key generation
+*/
+
+#include "sk_gen.h"
+
+#include "controlbits.h"
+#include "gf.h"
+#include "params.h"
+#include "util.h"
+
+/* input: f, element in GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE8192128F_VEC_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE8192128F_VEC_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // gaussian
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE8192128F_VEC_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE8192128F_VEC_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE8192128F_VEC_gf_mul(mat[ c ][ j ], inv) ;
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128F_VEC_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/*          0 otherwise */
+int PQCLEAN_MCELIECE8192128F_VEC_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE8192128F_VEC_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/crypto_kem/mceliece8192128f/vec/sk_gen.h b/crypto_kem/mceliece8192128f/vec/sk_gen.h
new file mode 100644
index 00000000..559aa3fe
--- /dev/null
+++ b/crypto_kem/mceliece8192128f/vec/sk_gen.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_MCELIECE8192128F_VEC_SK_GEN_H
+#define PQCLEAN_MCELIECE8192128F_VEC_SK_GEN_H
+/*
+  This file is for secret-key generation
+*/
+
+
+#include "gf.h"
+
+#include <stdint.h>
+
+int PQCLEAN_MCELIECE8192128F_VEC_genpoly_gen(gf * /*out*/, gf * /*f*/);
+int PQCLEAN_MCELIECE8192128F_VEC_perm_check(const uint32_t * /*p*/);
+
+#endif
+
diff --git a/crypto_kem/mceliece8192128f/vec/transpose.c b/crypto_kem/mceliece8192128f/vec/transpose.c
new file mode 100644
index 00000000..ccafa231
--- /dev/null
+++ b/crypto_kem/mceliece8192128f/vec/transpose.c
@@ -0,0 +1,35 @@
+#include "transpose.h"
+
+/* input: in, a 64x64 matrix over GF(2) */
+/* output: out, transpose of in */
+void PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in) {
+    int i, j, s, d;
+
+    uint64_t x, y;
+    uint64_t masks[6][2] = {
+        {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
+        {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
+        {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
+        {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
+        {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
+        {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
+    };
+
+    for (i = 0; i < 64; i++) {
+        out[i] = in[i];
+    }
+
+    for (d = 5; d >= 0; d--) {
+        s = 1 << d;
+
+        for (i = 0; i < 64; i += s * 2) {
+            for (j = i; j < i + s; j++) {
+                x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
+                y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
+
+                out[j + 
0] = x; + out[j + s] = y; + } + } + } +} diff --git a/crypto_kem/mceliece8192128f/vec/transpose.h b/crypto_kem/mceliece8192128f/vec/transpose.h new file mode 100644 index 00000000..af38e59d --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/transpose.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_TRANSPOSE_H +#define PQCLEAN_MCELIECE8192128F_VEC_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + + +void PQCLEAN_MCELIECE8192128F_VEC_transpose_64x64(uint64_t *out, const uint64_t *in); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/util.c b/crypto_kem/mceliece8192128f/vec/util.c new file mode 100644 index 00000000..172b25a7 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/util.c @@ -0,0 +1,76 @@ +#include "util.h" + +void PQCLEAN_MCELIECE8192128F_VEC_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128F_VEC_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128F_VEC_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128F_VEC_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[0][i] = v0; + out[1][i] = v1; + } +} + +void PQCLEAN_MCELIECE8192128F_VEC_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128F_VEC_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} diff --git a/crypto_kem/mceliece8192128f/vec/util.h b/crypto_kem/mceliece8192128f/vec/util.h new file mode 100644 index 00000000..0425e0ef --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/util.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_UTIL_H +#define PQCLEAN_MCELIECE8192128F_VEC_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec.h" + +#include + +void PQCLEAN_MCELIECE8192128F_VEC_store2(unsigned char *dest, uint16_t a); + +uint16_t PQCLEAN_MCELIECE8192128F_VEC_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE8192128F_VEC_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE8192128F_VEC_irr_load(vec out[][GFBITS], const unsigned char *in); + +void PQCLEAN_MCELIECE8192128F_VEC_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE8192128F_VEC_load8(const unsigned char *in); + +#endif + diff --git a/crypto_kem/mceliece8192128f/vec/vec.c b/crypto_kem/mceliece8192128f/vec/vec.c new file mode 100644 index 00000000..b11ac043 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/vec.c @@ -0,0 +1,138 @@ +#include "vec.h" + +#include "params.h" + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_setbits(vec b) { + vec ret = -b; + + return ret; +} + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(uint16_t v) { + 
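+    // replicate the 16-bit value into all four 16-bit lanes of the 64-bit word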
vec ret; + + ret = v; + ret |= ret << 16; + ret |= ret << 32; + + return ret; +} + +void PQCLEAN_MCELIECE8192128F_VEC_vec_copy(vec *out, const vec *in) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i] = in[i]; + } +} + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_or_reduce(const vec *a) { + int i; + vec ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret |= a[i]; + } + + return ret; +} + +int PQCLEAN_MCELIECE8192128F_VEC_vec_testz(vec a) { + a |= a >> 32; + a |= a >> 16; + a |= a >> 8; + a |= a >> 4; + a |= a >> 2; + a |= a >> 1; + + return (int)(a & 1) ^ 1; +} + +void PQCLEAN_MCELIECE8192128F_VEC_vec_mul(vec *h, const vec *f, const vec *g) { + int i, j; + vec buf[ 2 * GFBITS - 1 ]; + + for (i = 0; i < 2 * GFBITS - 1; i++) { + buf[i] = 0; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < GFBITS; j++) { + buf[i + j] ^= f[i] & g[j]; + } + } + + for (i = 2 * GFBITS - 2; i >= GFBITS; i--) { + buf[i - GFBITS + 4] ^= buf[i]; + buf[i - GFBITS + 3] ^= buf[i]; + buf[i - GFBITS + 1] ^= buf[i]; + buf[i - GFBITS + 0] ^= buf[i]; + } + + for (i = 0; i < GFBITS; i++) { + h[i] = buf[i]; + } +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE8192128F_VEC_vec_sq(vec *out, const vec *in) { + int i; + vec result[GFBITS], t; + + t = in[11] ^ in[12]; + + result[0] = in[0] ^ in[11]; + result[1] = in[7] ^ t; + result[2] = in[1] ^ in[7]; + result[3] = in[8] ^ t; + result[4] = in[2] ^ in[7]; + result[4] = result[4] ^ in[8]; + result[4] = result[4] ^ t; + result[5] = in[7] ^ in[9]; + result[6] = in[3] ^ in[8]; + result[6] = result[6] ^ in[9]; + result[6] = result[6] ^ in[12]; + result[7] = in[8] ^ in[10]; + result[8] = in[4] ^ in[9]; + result[8] = result[8] ^ in[10]; + result[9] = in[9] ^ in[11]; + result[10] = in[5] ^ in[10]; + result[10] = result[10] ^ in[11]; + result[11] = in[10] ^ in[12]; + result[12] = in[6] ^ t; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE8192128F_VEC_vec_inv(vec *out, const vec *in) { + vec tmp_11[ GFBITS ]; + vec tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE8192128F_VEC_vec_copy(out, in); + + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp_11, out, in); // ^11 + + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, tmp_11); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(tmp_1111, out, tmp_11); // ^1111 + + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, tmp_1111); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out, out, tmp_1111); // ^11111111 + + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); + PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out, out, tmp_1111); // ^111111111111 + + PQCLEAN_MCELIECE8192128F_VEC_vec_sq(out, out); // ^1111111111110 +} + diff --git a/crypto_kem/mceliece8192128f/vec/vec.h b/crypto_kem/mceliece8192128f/vec/vec.h new file mode 100644 index 00000000..2b32b426 --- /dev/null +++ b/crypto_kem/mceliece8192128f/vec/vec.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_VEC_VEC_H +#define PQCLEAN_MCELIECE8192128F_VEC_VEC_H + +#include "params.h" + +#include + +typedef uint64_t vec; + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_setbits(vec b); + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(uint16_t v); + +void 
PQCLEAN_MCELIECE8192128F_VEC_vec_copy(vec *out, const vec *in); + +vec PQCLEAN_MCELIECE8192128F_VEC_vec_or_reduce(const vec *a); + +int PQCLEAN_MCELIECE8192128F_VEC_vec_testz(vec a); + +void PQCLEAN_MCELIECE8192128F_VEC_vec_mul(vec * /*h*/, const vec * /*f*/, const vec * /*g*/); +void PQCLEAN_MCELIECE8192128F_VEC_vec_sq(vec * /*out*/, const vec * /*in*/); +void PQCLEAN_MCELIECE8192128F_VEC_vec_inv(vec * /*out*/, const vec * /*in*/); + +#endif + diff --git a/test/duplicate_consistency/mceliece348864_avx.yml b/test/duplicate_consistency/mceliece348864_avx.yml new file mode 100644 index 00000000..ffac4dce --- /dev/null +++ b/test/duplicate_consistency/mceliece348864_avx.yml @@ -0,0 +1,293 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: 
mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864_clean.yml b/test/duplicate_consistency/mceliece348864_clean.yml new file mode 100644 index 00000000..a8e46e4c --- /dev/null +++ b/test/duplicate_consistency/mceliece348864_clean.yml @@ -0,0 +1,395 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - 
decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece8192128 
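+# Each entry below lists files expected to stay in sync with the named source scheme/implementation (identical up to namespace prefixes).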
+- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864_sse.yml b/test/duplicate_consistency/mceliece348864_sse.yml new file mode 100644 index 00000000..2785e005 --- /dev/null +++ b/test/duplicate_consistency/mceliece348864_sse.yml @@ -0,0 +1,293 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - 
controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864_vec.yml b/test/duplicate_consistency/mceliece348864_vec.yml new file mode 100644 index 00000000..359b2606 --- /dev/null +++ b/test/duplicate_consistency/mceliece348864_vec.yml @@ -0,0 +1,345 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars.inc + - scalars_2x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - 
controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - 
decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864f_avx.yml b/test/duplicate_consistency/mceliece348864f_avx.yml new file mode 100644 index 00000000..12695975 --- /dev/null +++ b/test/duplicate_consistency/mceliece348864f_avx.yml @@ -0,0 +1,293 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: 
clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864f_clean.yml b/test/duplicate_consistency/mceliece348864f_clean.yml new file mode 100644 index 00000000..71c03a21 --- /dev/null +++ b/test/duplicate_consistency/mceliece348864f_clean.yml @@ -0,0 +1,395 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - 
sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - 
decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864f_sse.yml b/test/duplicate_consistency/mceliece348864f_sse.yml new file mode 100644 index 00000000..a45bd7bb --- /dev/null +++ b/test/duplicate_consistency/mceliece348864f_sse.yml @@ -0,0 +1,293 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - scalars.inc + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h 
+ - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece348864f_vec.yml b/test/duplicate_consistency/mceliece348864f_vec.yml new file mode 100644 index 00000000..882da6b8 --- /dev/null +++ b/test/duplicate_consistency/mceliece348864f_vec.yml @@ -0,0 +1,345 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars.inc + - scalars_2x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - 
controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896_avx.yml b/test/duplicate_consistency/mceliece460896_avx.yml new file mode 100644 index 00000000..bb726b67 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896_avx.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + 
implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896_clean.yml b/test/duplicate_consistency/mceliece460896_clean.yml new file mode 100644 index 00000000..d0752b88 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896_clean.yml @@ -0,0 +1,413 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - 
synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c 
+ - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896_sse.yml b/test/duplicate_consistency/mceliece460896_sse.yml new file mode 100644 index 00000000..6dab8691 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896_sse.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c 
+ - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896_vec.yml b/test/duplicate_consistency/mceliece460896_vec.yml new file mode 100644 index 00000000..b6bc8668 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896_vec.yml @@ -0,0 +1,390 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - 
controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- 
files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896f_avx.yml b/test/duplicate_consistency/mceliece460896f_avx.yml new file mode 100644 index 00000000..11968ed7 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896f_avx.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - 
controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git 
a/test/duplicate_consistency/mceliece460896f_clean.yml b/test/duplicate_consistency/mceliece460896f_clean.yml new file mode 100644 index 00000000..edc78e64 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896f_clean.yml @@ -0,0 +1,413 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f 
+- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896f_sse.yml b/test/duplicate_consistency/mceliece460896f_sse.yml new file mode 100644 index 00000000..ffac9f99 --- /dev/null +++ b/test/duplicate_consistency/mceliece460896f_sse.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece460896f_vec.yml b/test/duplicate_consistency/mceliece460896f_vec.yml new file mode 100644 index 00000000..5da92d5c --- /dev/null +++ b/test/duplicate_consistency/mceliece460896f_vec.yml @@ -0,0 +1,394 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + 
- encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128_avx.yml b/test/duplicate_consistency/mceliece6688128_avx.yml new file mode 100644 index 00000000..d59a36ba --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128_avx.yml @@ -0,0 +1,295 @@ 
+consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - 
aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128_clean.yml b/test/duplicate_consistency/mceliece6688128_clean.yml new file mode 100644 index 00000000..4fa2ab25 --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128_clean.yml @@ -0,0 +1,417 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - 
gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.h + - root.c + - 
root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128_sse.yml b/test/duplicate_consistency/mceliece6688128_sse.yml new file mode 100644 index 00000000..d59a36ba --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128_sse.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - 
controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128_vec.yml b/test/duplicate_consistency/mceliece6688128_vec.yml new file mode 100644 index 00000000..37401839 --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128_vec.yml @@ -0,0 +1,401 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + 
implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - 
scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128f_avx.yml b/test/duplicate_consistency/mceliece6688128f_avx.yml new file mode 100644 index 00000000..d59a36ba --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128f_avx.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - 
sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128f_clean.yml b/test/duplicate_consistency/mceliece6688128f_clean.yml new file mode 100644 index 00000000..684c5406 --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128f_clean.yml @@ -0,0 +1,417 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - 
pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h 
+ - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128f_sse.yml b/test/duplicate_consistency/mceliece6688128f_sse.yml new file mode 100644 index 00000000..d59a36ba --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128f_sse.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - 
aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6688128f_vec.yml b/test/duplicate_consistency/mceliece6688128f_vec.yml new file mode 100644 index 00000000..9599e6f7 --- /dev/null +++ b/test/duplicate_consistency/mceliece6688128f_vec.yml @@ -0,0 +1,401 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + 
source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - gf.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - 
scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119_avx.yml b/test/duplicate_consistency/mceliece6960119_avx.yml new file mode 100644 index 00000000..edd955ee --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119_avx.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - 
controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h 
+ - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119_clean.yml b/test/duplicate_consistency/mceliece6960119_clean.yml new file mode 100644 index 00000000..414f4613 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119_clean.yml @@ -0,0 +1,400 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - 
aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119_sse.yml b/test/duplicate_consistency/mceliece6960119_sse.yml new file mode 100644 index 00000000..edd955ee --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119_sse.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c 
+ - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - 
controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119_vec.yml b/test/duplicate_consistency/mceliece6960119_vec.yml new file mode 100644 index 00000000..1c639633 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119_vec.yml @@ -0,0 +1,378 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + 
source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119f_avx.yml b/test/duplicate_consistency/mceliece6960119f_avx.yml new file mode 100644 index 00000000..5f7bad61 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119f_avx.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + 
- sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + 
source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119f_clean.yml b/test/duplicate_consistency/mceliece6960119f_clean.yml new file mode 100644 index 00000000..063d7f52 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119f_clean.yml @@ -0,0 +1,400 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: 
mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119f_sse.yml b/test/duplicate_consistency/mceliece6960119f_sse.yml new file mode 100644 index 00000000..5f7bad61 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119f_sse.yml @@ -0,0 +1,297 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - 
aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece6960119f_vec.yml b/test/duplicate_consistency/mceliece6960119f_vec.yml new file mode 100644 index 00000000..0c36c6d3 --- /dev/null +++ b/test/duplicate_consistency/mceliece6960119f_vec.yml @@ -0,0 +1,378 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - 
fft.c + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft.h + - fft_tr.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - 
scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128_avx.yml b/test/duplicate_consistency/mceliece8192128_avx.yml new file mode 100644 index 00000000..c5c7f31d --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128_avx.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + 
implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128_clean.yml b/test/duplicate_consistency/mceliece8192128_clean.yml new file mode 100644 index 00000000..4dc73b0d --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128_clean.yml @@ -0,0 +1,411 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - 
sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + 
scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128_sse.yml b/test/duplicate_consistency/mceliece8192128_sse.yml new file mode 100644 index 00000000..c5c7f31d --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128_sse.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - 
controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128_vec.yml b/test/duplicate_consistency/mceliece8192128_vec.yml new file mode 100644 index 00000000..4f2a98e4 --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128_vec.yml @@ -0,0 +1,366 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - 
transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128f_avx.yml b/test/duplicate_consistency/mceliece8192128f_avx.yml new file mode 100644 index 00000000..1963a14a --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128f_avx.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128f_clean.yml b/test/duplicate_consistency/mceliece8192128f_clean.yml new file mode 100644 index 00000000..36a43135 --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128f_clean.yml @@ -0,0 +1,411 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - 
crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - pk_gen.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - 
aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.h + - operations.c + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - root.c + - root.h + - sk_gen.c + - sk_gen.h + - synd.c + - synd.h + - transpose.c + - transpose.h + - util.c + - util.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128f_sse.yml b/test/duplicate_consistency/mceliece8192128f_sse.yml new file mode 100644 index 00000000..1963a14a --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128f_sse.yml @@ -0,0 +1,295 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + 
implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - sk_gen.c + - sk_gen.h + source: + implementation: vec + scheme: mceliece8192128f diff --git a/test/duplicate_consistency/mceliece8192128f_vec.yml b/test/duplicate_consistency/mceliece8192128f_vec.yml new file mode 100644 index 00000000..84371961 --- /dev/null +++ b/test/duplicate_consistency/mceliece8192128f_vec.yml @@ -0,0 +1,366 @@ +consistency_checks: +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - 
operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece348864f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece460896f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - fft_tr.c + - operations.c + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.h + source: + implementation: vec + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6688128f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - 
transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119 +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - pk_gen.h + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece6960119f +- files: + - aes256ctr.c + - aes256ctr.h + - benes.c + - benes.h + - bm.c + - bm.h + - consts.inc + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.c + - decrypt.h + - encrypt.c + - encrypt.h + - fft.c + - fft.h + - fft_tr.c + - fft_tr.h + - gf.c + - gf.h + - operations.c + - params.h + - pk_gen.h + - powers.inc + - scalars_2x.inc + - scalars_4x.inc + - sk_gen.c + - sk_gen.h + - transpose.c + - transpose.h + - util.c + - util.h + - vec.c + - vec.h + source: + implementation: vec + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128 +- files: + - aes256ctr.c + - aes256ctr.h + - api.h + - controlbits.c + - controlbits.h + - crypto_hash.h + - decrypt.h + - encrypt.h + - operations.c + - params.h + - sk_gen.c + - sk_gen.h + source: + implementation: clean + scheme: mceliece8192128f diff --git a/test/helpers.py b/test/helpers.py index f92764dd..52a587d0 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -274,3 +274,27 @@ def filtered_test(func): else: raise unittest.SkipTest("Test disabled by filter") return wrapper + + +__CPUINFO = None + + +def get_cpu_info(): + global __CPUINFO + while __CPUINFO is None or 'flags' not in __CPUINFO: + import cpuinfo + __CPUINFO = cpuinfo.get_cpu_info() + + # CPUINFO is unreliable on Travis CI Macs + if 'CI' in os.environ and sys.platform == 'darwin': + __CPUINFO['flags'] = [ + 'aes', 'apic', 'avx1.0', 'clfsh', 'cmov', 'cx16', 'cx8', 'de', + 'em64t', 'erms', 'f16c', 'fpu', 'fxsr', 'lahf', 'mca', 'mce', + 'mmx', 'mon', 'msr', 'mtrr', 'osxsave', 'pae', 'pat', 'pcid', + 'pclmulqdq', 'pge', 'popcnt', 'pse', 'pse36', 'rdrand', + 'rdtscp', 'rdwrfsgs', 'sep', 'smep', 'ss', 'sse', 'sse2', + 'sse3', 'sse4.1', 'sse4.2', 'ssse3', 'syscall', 'tsc', + 'tsc_thread_offset', 'tsci', 'tsctmr', 'vme', 'vmm', 'x2apic', + 'xd', 'xsave'] + + return __CPUINFO diff --git a/test/pqclean.py b/test/pqclean.py index 16da66fb..8713b5df 100644 --- a/test/pqclean.py +++ b/test/pqclean.py @@ -4,6 +4,7 @@ from typing import Optional import yaml import platform +import helpers class Scheme: @@ -152,19 +153,15 @@ class Implementation: if not self.supported_on_os(): return False - if not hasattr(Implementation, 'CPUINFO'): - import cpuinfo - Implementation.CPUINFO = cpuinfo.get_cpu_info() - - CPUINFO = Implementation.CPUINFO + cpuinfo = helpers.get_cpu_info() for platform_ in self.metadata()['supported_platforms']: - if platform_['architecture'] 
== CPUINFO['arch'].lower(): + if platform_['architecture'] == cpuinfo['arch'].lower(): # Detect actually running on emulated i386 if (platform_['architecture'] == 'x86_64' and platform.architecture()[0] == '32bit'): continue - if all([flag in CPUINFO['flags'] + if all([flag in cpuinfo['flags'] for flag in platform_['required_flags']]): return True return False diff --git a/test/test_duplicate_consistency.py b/test/test_duplicate_consistency.py index 6dbd4f60..a6e0763f 100644 --- a/test/test_duplicate_consistency.py +++ b/test/test_duplicate_consistency.py @@ -11,6 +11,8 @@ import yaml import helpers import pqclean +sys.tracebacklimit = 0 + def pytest_generate_tests(metafunc): ids = [] @@ -28,44 +30,48 @@ def pytest_generate_tests(metafunc): metadata = yaml.safe_load(f.read()) for group in metadata['consistency_checks']: source = pqclean.Implementation.by_name( - group['source']['scheme'], - group['source']['implementation']) - for file in group['files']: - argvalues.append((implementation, source, file)) - ids.append( - "{scheme.name} {implementation.name} {source.scheme.name}: {file}" - .format(scheme=scheme, source=source, - implementation=implementation, - file=file)) - metafunc.parametrize(('implementation', 'source', 'file'), + group['source']['scheme'], + group['source']['implementation']) + argvalues.append( + (implementation, source, group['files'])) + ids.append( + "{metafile}: {scheme.name} {implementation.name}" + .format(scheme=scheme, + implementation=implementation, + metafile=metafile)) + metafunc.parametrize(('implementation', 'source', 'files'), argvalues, ids=ids) def file_get_contents(filename): - with open(filename) as f: - return f.read() + with open(filename) as file: + return file.read() @helpers.filtered_test -def test_duplicate_consistency(implementation, source, file): - target_path = os.path.join(source.path(), file) - this_path = os.path.join(implementation.path(), file) - target_src = file_get_contents(target_path) - this_src = file_get_contents(this_path) - this_transformed_src = this_src.replace( - implementation.namespace_prefix(), '') - target_transformed_src = target_src.replace(source.namespace_prefix(), '') +def test_duplicate_consistency(implementation, source, files): + """Test sets of files to be identical modulo namespacing""" + messages = [] + for file in files: + target_path = os.path.join(source.path(), file) + this_path = os.path.join(implementation.path(), file) + target_src = file_get_contents(target_path) + this_src = file_get_contents(this_path) + this_transformed_src = this_src.replace( + implementation.namespace_prefix(), '') + target_transformed_src = target_src.replace( + source.namespace_prefix(), '') - if not this_transformed_src == target_transformed_src: - diff = difflib.unified_diff( + if not this_transformed_src == target_transformed_src: + diff = difflib.unified_diff( this_transformed_src.splitlines(keepends=True), target_transformed_src.splitlines(keepends=True), fromfile=this_path, tofile=target_path) - raise AssertionError( - "Files differed:\n" - + ''.join(diff)) + messages.append("{} differed:\n{}".format(file, ''.join(diff))) + if messages: + raise AssertionError("Files differed:\n{}".format('\n'.join(messages))) if __name__ == '__main__':
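
Note (not part of the patch): the duplicate_consistency YAML files added above are consumed by the reworked test/test_duplicate_consistency.py, which now parametrizes one test per group of files instead of one per file and reports every mismatching file in a single assertion. The sketch below restates that check in standalone form so the YAML groups are easier to read; the directory paths and namespace prefixes in the example call are illustrative assumptions, since the real test derives them from pqclean.Implementation rather than taking them as arguments.

    # Simplified, standalone sketch of the duplicate-consistency check.
    # Every file listed in a YAML group must be byte-identical to the copy
    # in the "source" implementation once the per-scheme namespace prefixes
    # are stripped. Paths/prefixes passed in are assumptions for illustration.
    import difflib
    import os


    def strip_namespace(text, prefix):
        """Remove a scheme/implementation namespace prefix from C source text."""
        return text.replace(prefix, '')


    def compare_group(impl_dir, impl_prefix, src_dir, src_prefix, files):
        """Return unified diffs for every listed file that differs modulo namespacing."""
        messages = []
        for name in files:
            with open(os.path.join(impl_dir, name)) as f:
                this_src = strip_namespace(f.read(), impl_prefix)
            with open(os.path.join(src_dir, name)) as f:
                target_src = strip_namespace(f.read(), src_prefix)
            if this_src != target_src:
                diff = difflib.unified_diff(
                    this_src.splitlines(keepends=True),
                    target_src.splitlines(keepends=True),
                    fromfile=os.path.join(impl_dir, name),
                    tofile=os.path.join(src_dir, name))
                messages.append("{} differed:\n{}".format(name, ''.join(diff)))
        return messages


    # Hypothetical usage mirroring one group from mceliece8192128f_vec.yml
    # (source: implementation vec, scheme mceliece8192128):
    # compare_group(
    #     'crypto_kem/mceliece8192128f/vec', 'PQCLEAN_MCELIECE8192128F_VEC_',
    #     'crypto_kem/mceliece8192128/vec', 'PQCLEAN_MCELIECE8192128_VEC_',
    #     ['aes256ctr.c', 'aes256ctr.h', 'controlbits.c'])

Grouping the files this way keeps the pytest run at one test case per YAML group, which matters here because the McEliece schemes share a large number of files across their clean/vec/sse/avx implementations.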